axi_adc_decimate: Re-implemented FIR filter

The minimum decimation rate of the CIC block is five, which means data arrives at the FIR filter at most once every five clock cycles. The decimation rate of the filter is two, so the filter produces an output at most once every ten clock cycles. This leaves ten clock cycles to compute each result. The current implementation of the filter uses a fully pipelined architecture with one multiplier for each coefficient; each multiplier does work for one clock cycle and then sits idle for the next nine clock cycles. Rework the filter to be sequential, reducing the number of required multipliers to one. In addition, exploit the symmetric structure of the filter to make use of the pre-adder, reducing the number of required multiply operations by a factor of two. This significantly reduces the logic utilization of the filter and moderately reduces power consumption. Signed-off-by: Lars-Peter Clausen <lars@metafoo.de>
2017-04-03 18:54:00 +02:00 · 2017-04-03 18:54:00 +02:00 · 3e7325b29a
parent 737418a1b0
commit 3e7325b29a
1 changed files with 255 additions and 305 deletions
--- a/library/axi_adc_decimate/fir_decim.v
+++ b/library/axi_adc_decimate/fir_decim.v
@ -1,331 +1,281 @@
-// -------------------------------------------------------------
+// ***************************************************************************
+// ***************************************************************************
+// Copyright 2017(c) Analog Devices, Inc.
 //
-// Module: fir_decim
-// Generated by MATLAB(R) 9.0 and the Filter Design HDL Coder 3.0.
-// Generated on: 2016-07-05 15:45:22
-// -------------------------------------------------------------
-
-// -------------------------------------------------------------
-// HDL Code Generation Options:
+// All rights reserved.
 //
-// FIRAdderStyle: tree
-// OptimizeForHDL: on
-// EDAScriptGeneration: off
-// AddPipelineRegisters: on
-// Name: fir_decim
-// TargetLanguage: Verilog
-// TestBenchName: fo_copy_tb
-// TestBenchStimulus: step ramp chirp noise 
-// GenerateHDLTestBench: off
-
-// -------------------------------------------------------------
-// HDL Implementation    : Fully parallel
-// Multipliers           : 6
-// Folding Factor        : 1
-// -------------------------------------------------------------
-// Filter Settings:
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//     - Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     - Redistributions in binary form must reproduce the above copyright
+//       notice, this list of conditions and the following disclaimer in
+//       the documentation and/or other materials provided with the
+//       distribution.
+//     - Neither the name of Analog Devices, Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//     - The use of this software may or may not infringe the patent rights
+//       of one or more patent holders.  This license does not release you
+//       from the requirement that you obtain separate licenses from these
+//       patent holders to use this software.
+//     - Use of the software either in source or binary form, must be run
+//       on or directly connected to an Analog Devices Inc. component.
 //
-// Discrete-Time FIR Multirate Filter (real)
-// -----------------------------------------
-// Filter Structure   : Direct-Form FIR Polyphase Decimator
-// Decimation Factor  : 2
-// Polyphase Length   : 3
-// Filter Length      : 6
-// Stable             : Yes
-// Linear Phase       : Yes (Type 2)
+// THIS SOFTWARE IS PROVIDED BY ANALOG DEVICES "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+// INCLUDING, BUT NOT LIMITED TO, NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED.
 //
-// Arithmetic         : fixed
-// Numerator          : s12,11 -> [-1 1)
-// -------------------------------------------------------------
+// IN NO EVENT SHALL ANALOG DEVICES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, INTELLECTUAL PROPERTY
+// RIGHTS, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+// BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+// THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// ***************************************************************************
+// ***************************************************************************

 `timescale 1 ns / 1 ns

-module fir_decim
-               (
-                clk,
-                clk_enable,
-                reset,
-                filter_in,
-                filter_out,
-                ce_out
-                );
+module fir_decim #(
+  parameter USE_DSP48E = 1
+) (
+  input clk,
+  input clk_enable,
+  input reset,
+  input signed [11:0] filter_in,
+  output reg signed [25:0] filter_out,
+  output reg ce_out
+);

-  input   clk; 
-  input   clk_enable; 
-  input   reset; 
-  input   signed [11:0] filter_in; //sfix12_En11
-  output  signed [25:0] filter_out; //sfix26_En22
-  output  ce_out; 
+  localparam signed [11:0] coeffphase1_1 = 12'b000011010101; //sfix12_En11
+  localparam signed [11:0] coeffphase1_2 = 12'b011011110010; //sfix12_En11
+  localparam signed [11:0] coeffphase1_3 = 12'b110000111110; //sfix12_En11

-////////////////////////////////////////////////////////////////
-//Module Architecture: fir_decim
-////////////////////////////////////////////////////////////////
-  // Local Functions
-  // Type Definitions
-  // Constants
-  parameter signed [11:0] coeffphase1_1 = 12'b000011010101; //sfix12_En11
-  parameter signed [11:0] coeffphase1_2 = 12'b011011110010; //sfix12_En11
-  parameter signed [11:0] coeffphase1_3 = 12'b110000111110; //sfix12_En11
-  parameter signed [11:0] coeffphase2_1 = 12'b110000111110; //sfix12_En11
-  parameter signed [11:0] coeffphase2_2 = 12'b011011110010; //sfix12_En11
-  parameter signed [11:0] coeffphase2_3 = 12'b000011010101; //sfix12_En11
+  // We know that clk_enable is asserted at most every 5th clock cycle and the
+  // output is decimated by two. So we have 10 clock cycles to compute the
+  // result. That's plenty of time considering that there are only 6
+  // coefficients.

-  // Signals
-  reg  [1:0] ring_count; // ufix2
-  wire phase_0; // boolean
-  wire phase_1; // boolean
-  reg  ce_out_reg; // boolean
-  reg  signed [11:0] input_register; // sfix12_En11
-  reg  signed [11:0] input_pipeline_phase0 [0:1] ; // sfix12_En11
-  reg  signed [11:0] input_pipeline_phase1 [0:2] ; // sfix12_En11
-  wire signed [23:0] product_phase0_1; // sfix24_En22
-  wire signed [23:0] product_phase0_2; // sfix24_En22
-  wire signed [23:0] product_phase0_3; // sfix24_En22
-  wire signed [23:0] product_phase1_1; // sfix24_En22
-  wire signed [23:0] product_phase1_2; // sfix24_En22
-  wire signed [23:0] product_phase1_3; // sfix24_En22
-  reg  signed [23:0] product_pipeline_phase0_1; // sfix24_En22
-  reg  signed [23:0] product_pipeline_phase0_2; // sfix24_En22
-  reg  signed [23:0] product_pipeline_phase0_3; // sfix24_En22
-  reg  signed [23:0] product_pipeline_phase1_1; // sfix24_En22
-  reg  signed [23:0] product_pipeline_phase1_2; // sfix24_En22
-  reg  signed [23:0] product_pipeline_phase1_3; // sfix24_En22
-  wire signed [25:0] sumvector1 [0:2] ; // sfix26_En22
-  wire signed [23:0] add_signext; // sfix24_En22
-  wire signed [23:0] add_signext_1; // sfix24_En22
-  wire signed [24:0] add_temp; // sfix25_En22
-  wire signed [23:0] add_signext_2; // sfix24_En22
-  wire signed [23:0] add_signext_3; // sfix24_En22
-  wire signed [24:0] add_temp_1; // sfix25_En22
-  wire signed [23:0] add_signext_4; // sfix24_En22
-  wire signed [23:0] add_signext_5; // sfix24_En22
-  wire signed [24:0] add_temp_2; // sfix25_En22
-  reg  signed [25:0] sumdelay_pipeline1 [0:2] ; // sfix26_En22
-  wire signed [25:0] sumvector2 [0:1] ; // sfix26_En22
-  wire signed [25:0] add_signext_6; // sfix26_En22
-  wire signed [25:0] add_signext_7; // sfix26_En22
-  wire signed [26:0] add_temp_3; // sfix27_En22
-  reg  signed [25:0] sumdelay_pipeline2 [0:1] ; // sfix26_En22
-  wire signed [25:0] sum3; // sfix26_En22
-  wire signed [25:0] add_signext_8; // sfix26_En22
-  wire signed [25:0] add_signext_9; // sfix26_En22
-  wire signed [26:0] add_temp_4; // sfix27_En22
-  reg  ce_delayline1; // boolean
-  reg  ce_delayline2; // boolean
-  reg  ce_delayline3; // boolean
-  reg  ce_delayline4; // boolean
-  reg  ce_delayline5; // boolean
-  reg  ce_delayline6; // boolean
-  reg  ce_delayline7; // boolean
-  reg  ce_delayline8; // boolean
-  wire ce_gated; // boolean
-  reg  signed [25:0] output_register; // sfix26_En22
+  reg active = 1'b0;
+  reg active_d1 = 1'b0;
+  reg active_d2 = 1'b0;

-  // Block Statements
-  always @ (posedge clk or posedge reset)
-    begin: ce_output
-      if (reset == 1'b1) begin
-        ring_count <= 1;
+  reg [1:0] count = 2'b00;
+  reg phase = 1'b1;
+  reg ready = 1'b0;
+
+  reg [3:0] storage0[0:11];
+  reg [3:0] storage1[0:11];
+
+  reg signed [11:0] data0;
+  reg signed [11:0] data1;
+
+  reg signed [11:0] coeff;
+
+  wire signed [25:0] sum;
+
+  integer j;
+
+  initial begin // zero both bit-history arrays at time zero
+    for (j = 0; j < 12; j = j + 1) begin // 12 entries, one per bit of filter_in
+      storage0[j] <= 'h00;
+      storage1[j] <= 'h00;
+    end
+  end
+
+  always @(posedge clk) begin
+    if (reset == 1'b1) begin
+      phase <= 1'b1;
+    end else begin
+      if (clk_enable == 1'b1) begin
+        phase <= phase + 1'b1;
      end
-      else begin
-                if (clk_enable == 1'b1) begin
-        ring_count <= {ring_count[0], ring_count[1]};
-              end
-            end
-    end // ce_output
+    end
+  end

-  assign  phase_0 = ring_count[0]  && clk_enable;
+  // Computation window control. active is set when a sample is accepted
+  // (clk_enable high) while phase is 1, and is cleared once count reaches 2;
+  // setting takes priority over clearing. active_d1 and active_d2 are copies
+  // of active delayed by one and two clock cycles.
+  always @(posedge clk) begin
+    active_d2 <= active_d1;
+    active_d1 <= active;
+
+    if (clk_enable && phase) begin
+      active <= 1'b1;
+    end else if (count == 2'd2) begin
+      active <= 1'b0;
+    end
+  end

-  assign  phase_1 = ring_count[1]  && clk_enable;
+  // Coefficient index. While active is set, count advances by one each clock
+  // cycle and returns to 0 after reaching 2; it holds its value otherwise.
+  always @(posedge clk) begin
+    if (active) begin
+      if (count == 2'd2) begin
+        count <= 2'd0;
+      end else begin
+        count <= count + 1'b1;
+      end
+    end
+  end

-  //   ------------------ CE Output Register ------------------
+  // ready is high for one cycle after the falling edge of the delayed active
+  // signal, i.e. when active_d1 is already low while active_d2 is still high.
+  always @(posedge clk) begin
+    ready <= 1'b0;
+    if (!active_d1 && active_d2) begin
+      ready <= 1'b1;
+    end
+  end

-  always @ (posedge clk or posedge reset)
-    begin: ce_output_register
-      if (reset == 1'b1) begin
-        ce_out_reg <= 1'b0;
-      end
-      else begin
-          ce_out_reg <= phase_1;
-      end
-    end // ce_output_register
-
-  always @ (posedge clk or posedge reset)
-    begin: input_reg_process
-      if (reset == 1'b1) begin
-        input_register <= 0;
-      end
-      else begin
-        if (clk_enable == 1'b1) begin
-          input_register <= filter_in;
+  generate
+  genvar i;
+  for (i = 0; i < 12; i = i + 1) begin
+    always @(posedge clk) begin
+      if (clk_enable == 1'b1) begin
+        if (phase == 1'b0) begin
+          storage0[i] <= {storage0[i][2:0],filter_in[i]};
+        end
+        if (phase == 1'b1) begin
+          storage1[i] <= {storage1[i][2:0],filter_in[i]};
        end
      end
-    end // input_reg_process
+    end

-  always @( posedge clk or posedge reset)
-    begin: Delay_Pipeline_Phase0_process
-      if (reset == 1'b1) begin
-        input_pipeline_phase0[0] <= 0;
-        input_pipeline_phase0[1] <= 0;
+    // Combinational tap select for bit i. As count advances, data0 reads the
+    // phase-0 history from index 2 down to 0 while data1 reads the phase-1
+    // history from index 0 up to 2. Blocking assignments are used because this
+    // block models combinational logic.
+    always @(*) begin
+      data0[i] = storage0[i][2-count];
+      data1[i] = storage1[i][count];
+    end
+  end
+  endgenerate
+
+  // Coefficient mux (combinational): picks the coefficient that belongs to the
+  // current value of count; any other count value selects zero. Blocking
+  // assignments are used because this block models combinational logic.
+  always @(*) begin
+    case (count)
+    'h0: coeff = coeffphase1_1;
+    'h1: coeff = coeffphase1_2;
+    'h2: coeff = coeffphase1_3;
+    default: coeff = 'h00;
+    endcase
+  end
+
+  generate if (USE_DSP48E) begin
+    wire [47:0] _sum;
+    wire [6:0] opmode = {1'b0,active_d2,5'b00101};
+
+    // Can't exceed 26 bit.
+    assign sum = _sum[43:18];
+
+    // MAC with pre-adder
+    DSP48E1 #(
+      .ACASCREG (0),
+      .ADREG (1),
+      .ALUMODEREG (0),
+      .AREG (0),
+      .AUTORESET_PATDET ("NO_RESET"),
+      .A_INPUT ("DIRECT"),
+      .BCASCREG (1),
+      .BREG (1),
+      .B_INPUT ("DIRECT"),
+      .CARRYINREG (0),
+      .CARRYINSELREG (0),
+      .CREG (0),
+      .DREG (0),
+      .INMODEREG (0),
+      .MASK (48'h3fffffffffff),
+      .MREG (1),
+      .OPMODEREG (1),
+      .PATTERN (48'h000000000000),
+      .PREG (1),
+      .SEL_MASK ("MASK"),
+      .SEL_PATTERN ("PATTERN"),
+      .USE_DPORT ("TRUE"),
+      .USE_MULT ("MULTIPLY"),
+      .USE_PATTERN_DETECT ("NO_PATDET"),
+      .USE_SIMD ("ONE48"))
+  i_dsp_mac (
+    .CLK (clk),
+    .A ({5'h0,data0[11],data0,12'h0}), // data0 in bits [23:12] (MSB-aligned to 24 bits), sign bit copied into bit 24, upper 5 bits zero
+    .B ({coeff,6'b0}), // coefficient left-shifted by 6 bits
+    .C (48'h00),
+    .D ({data1[11],data1,12'h0}), // data1 in bits [23:12], sign bit copied into bit 24 (same alignment as data0 on A)
+    .MULTSIGNIN (1'b0),
+    .CARRYIN (1'b0),
+    .CARRYCASCIN (1'b0),
+    .ACIN (30'h0),
+    .BCIN (18'h0),
+    .PCIN (48'h0),
+    .P (_sum),
+    .MULTSIGNOUT (),
+    .CARRYOUT (),
+    .CARRYCASCOUT (),
+    .ACOUT (),
+    .BCOUT (),
+    .PCOUT (),
+    .ALUMODE (4'b0000),
+    .CARRYINSEL (3'h0),
+    .INMODE (5'b00100),
+    .OPMODE (opmode),
+    .PATTERNBDETECT (),
+    .PATTERNDETECT (),
+    .OVERFLOW (),
+    .UNDERFLOW (),
+    .CEA1 (1'b0),
+    .CEA2 (1'b0),
+    .CEAD (active), // clock enables follow the active / active_d1 / active_d2 delay chain
+    .CEALUMODE (1'b0),
+    .CEB1 (1'b0),
+    .CEB2 (active),
+    .CEC (1'b0),
+    .CECARRYIN (1'b0),
+    .CECTRL (active),
+    .CED (1'b0),
+    .CEINMODE (1'b0),
+    .CEM (active_d1),
+    .CEP (active_d2),
+    .RSTA (1'b0),
+    .RSTALLCARRYIN (1'b0),
+    .RSTALUMODE (1'b0),
+    .RSTB (1'b0),
+    .RSTC (1'b0),
+    .RSTCTRL (1'b0),
+    .RSTD (1'b0),
+    .RSTINMODE (1'b0),
+    .RSTM (1'b0),
+    .RSTP (1'b0)
+  );
+
+  end else begin
+    reg signed [25:0] _sum = 'h00;
+    reg signed [12:0] pre_adder;
+    reg signed [11:0] coeff_d1;
+    reg signed [23:0] product = 'h00;
+
+    assign sum = _sum;
+
+    always @(posedge clk) begin
+      if (active == 1'b1) begin
+        pre_adder <= data0 + data1;
+        coeff_d1 <= coeff;
      end
-      else begin
-        if (phase_1 == 1'b1) begin
-          input_pipeline_phase0[0] <= input_register;
-          input_pipeline_phase0[1] <= input_pipeline_phase0[0];
-        end
+
+      if (active_d1 == 1'b1) begin
+        product <= coeff_d1 * pre_adder;
      end
-    end // Delay_Pipeline_Phase0_process

-
-  always @( posedge clk or posedge reset)
-    begin: Delay_Pipeline_Phase1_process
-      if (reset == 1'b1) begin
-        input_pipeline_phase1[0] <= 0;
-        input_pipeline_phase1[1] <= 0;
-        input_pipeline_phase1[2] <= 0;
+      if (reset == 1'b1 || ready == 1'b1) begin
+        _sum <= 'h00;
+      end else if (active_d2 == 1'b1) begin
+        _sum <= _sum + product;
      end
-      else begin
-        if (phase_0 == 1'b1) begin
-          input_pipeline_phase1[0] <= input_register;
-          input_pipeline_phase1[1] <= input_pipeline_phase1[0];
-          input_pipeline_phase1[2] <= input_pipeline_phase1[1];
-        end
-      end
-    end // Delay_Pipeline_Phase1_process
+    end
+  end
+  endgenerate

+  // Output strobe: ce_out is ready delayed by one clock cycle and is forced
+  // low while reset is high.
+  always @(posedge clk) begin
+    if (reset)
+      ce_out <= 1'b0;
+    else
+      ce_out <= ready;
+  end

-  assign product_phase0_1 = input_register * coeffphase1_1;
+  // Output register: capture the accumulated sum in the cycle where ready is
+  // high; filter_out keeps its previous value at all other times.
+  always @(posedge clk) begin
+    if (ready)
+      filter_out <= sum;
+  end

-  assign product_phase0_2 = input_pipeline_phase0[0] * coeffphase1_2;
-
-  assign product_phase0_3 = input_pipeline_phase0[1] * coeffphase1_3;
-
-  assign product_phase1_1 = input_pipeline_phase1[0] * coeffphase2_1;
-
-  assign product_phase1_2 = input_pipeline_phase1[1] * coeffphase2_2;
-
-  assign product_phase1_3 = input_pipeline_phase1[2] * coeffphase2_3;
-
-  always @ (posedge clk or posedge reset)
-    begin: product_pipeline_process1
-      if (reset == 1'b1) begin
-        product_pipeline_phase0_1 <= 0;
-        product_pipeline_phase1_1 <= 0;
-        product_pipeline_phase0_2 <= 0;
-        product_pipeline_phase1_2 <= 0;
-        product_pipeline_phase0_3 <= 0;
-        product_pipeline_phase1_3 <= 0;
-      end
-      else begin
-        if (phase_1 == 1'b1) begin
-          product_pipeline_phase0_1 <= product_phase0_1;
-          product_pipeline_phase1_1 <= product_phase1_1;
-          product_pipeline_phase0_2 <= product_phase0_2;
-          product_pipeline_phase1_2 <= product_phase1_2;
-          product_pipeline_phase0_3 <= product_phase0_3;
-          product_pipeline_phase1_3 <= product_phase1_3;
-        end
-      end
-    end // product_pipeline_process1
-
-  assign add_signext = product_pipeline_phase1_1;
-  assign add_signext_1 = product_pipeline_phase1_2;
-  assign add_temp = add_signext + add_signext_1;
-  assign sumvector1[0] = $signed({{1{add_temp[24]}}, add_temp});
-
-  assign add_signext_2 = product_pipeline_phase1_3;
-  assign add_signext_3 = product_pipeline_phase0_1;
-  assign add_temp_1 = add_signext_2 + add_signext_3;
-  assign sumvector1[1] = $signed({{1{add_temp_1[24]}}, add_temp_1});
-
-  assign add_signext_4 = product_pipeline_phase0_2;
-  assign add_signext_5 = product_pipeline_phase0_3;
-  assign add_temp_2 = add_signext_4 + add_signext_5;
-  assign sumvector1[2] = $signed({{1{add_temp_2[24]}}, add_temp_2});
-
-  always @ (posedge clk or posedge reset)
-    begin: sumdelay_pipeline_process1
-      if (reset == 1'b1) begin
-        sumdelay_pipeline1[0] <= 0;
-        sumdelay_pipeline1[1] <= 0;
-        sumdelay_pipeline1[2] <= 0;
-      end
-      else begin
-        if (phase_1 == 1'b1) begin
-          sumdelay_pipeline1[0] <= sumvector1[0];
-          sumdelay_pipeline1[1] <= sumvector1[1];
-          sumdelay_pipeline1[2] <= sumvector1[2];
-        end
-      end
-    end // sumdelay_pipeline_process1
-
-  assign add_signext_6 = sumdelay_pipeline1[0];
-  assign add_signext_7 = sumdelay_pipeline1[1];
-  assign add_temp_3 = add_signext_6 + add_signext_7;
-  assign sumvector2[0] = add_temp_3[25:0];
-
-  assign sumvector2[1] = sumdelay_pipeline1[2];
-
-  always @ (posedge clk or posedge reset)
-    begin: sumdelay_pipeline_process2
-      if (reset == 1'b1) begin
-        sumdelay_pipeline2[0] <= 0;
-        sumdelay_pipeline2[1] <= 0;
-      end
-      else begin
-        if (phase_1 == 1'b1) begin
-          sumdelay_pipeline2[0] <= sumvector2[0];
-          sumdelay_pipeline2[1] <= sumvector2[1];
-        end
-      end
-    end // sumdelay_pipeline_process2
-
-  assign add_signext_8 = sumdelay_pipeline2[0];
-  assign add_signext_9 = sumdelay_pipeline2[1];
-  assign add_temp_4 = add_signext_8 + add_signext_9;
-  assign sum3 = add_temp_4[25:0];
-
-  always @ (posedge clk or posedge reset)
-    begin: ce_delay
-      if (reset == 1'b1) begin
-        ce_delayline1 <= 1'b0;
-        ce_delayline2 <= 1'b0;
-        ce_delayline3 <= 1'b0;
-        ce_delayline4 <= 1'b0;
-        ce_delayline5 <= 1'b0;
-        ce_delayline6 <= 1'b0;
-        ce_delayline7 <= 1'b0;
-        ce_delayline8 <= 1'b0;
-      end
-      else begin
-        if (clk_enable == 1'b1) begin
-          ce_delayline1 <= clk_enable;
-          ce_delayline2 <= ce_delayline1;
-          ce_delayline3 <= ce_delayline2;
-          ce_delayline4 <= ce_delayline3;
-          ce_delayline5 <= ce_delayline4;
-          ce_delayline6 <= ce_delayline5;
-          ce_delayline7 <= ce_delayline6;
-          ce_delayline8 <= ce_delayline7;
-        end
-      end
-    end // ce_delay
-
-  assign ce_gated =  ce_delayline8 & ce_out_reg;
-
-  always @ (posedge clk or posedge reset)
-    begin: output_register_process
-      if (reset == 1'b1) begin
-        output_register <= 0;
-      end
-      else begin
-        if (phase_1 == 1'b1) begin
-          output_register <= sum3;
-        end
-      end
-    end // output_register_process
-
-  // Assignment Statements
-  assign ce_out = ce_gated;
-  assign filter_out = output_register;
-endmodule  // fir_decim
+endmodule