axi_hdmi: Let the tools assign the csc resources

Rewrite the color space conversion data path as a pipeline so that the
tools can make better use of the DSP blocks.
In the old implementation, the addition operations were performed outside
the DSPs.
main
AndreiGrozav 2018-09-13 16:26:51 +03:00 committed by AndreiGrozav
parent 47f7894881
commit 265781f29a
6 changed files with 96 additions and 321 deletions

View File

@ -5,9 +5,7 @@
LIBRARY_NAME := axi_hdmi_tx
GENERIC_DEPS += ../common/ad_csc_1.v
GENERIC_DEPS += ../common/ad_csc_1_add.v
GENERIC_DEPS += ../common/ad_csc_1_mul.v
GENERIC_DEPS += ../common/ad_csc.v
GENERIC_DEPS += ../common/ad_csc_RGB2CrYCb.v
GENERIC_DEPS += ../common/ad_mem.v
GENERIC_DEPS += ../common/ad_rst.v

View File

@ -7,9 +7,7 @@ adi_ip_create axi_hdmi_tx
adi_ip_files axi_hdmi_tx [list \
"$ad_hdl_dir/library/common/ad_mem.v" \
"$ad_hdl_dir/library/common/ad_rst.v" \
"$ad_hdl_dir/library/common/ad_csc_1_mul.v" \
"$ad_hdl_dir/library/common/ad_csc_1_add.v" \
"$ad_hdl_dir/library/common/ad_csc_1.v" \
"$ad_hdl_dir/library/common/ad_csc.v" \
"$ad_hdl_dir/library/common/ad_csc_RGB2CrYCb.v" \
"$ad_hdl_dir/library/common/ad_ss_444to422.v" \
"$ad_hdl_dir/library/common/up_axi.v" \

View File

@ -36,78 +36,92 @@
`timescale 1ns/100ps
module ad_csc_1 #(
module ad_csc #(
// NOTE: this span interleaves the removed ad_csc_1 module with its
// replacement ad_csc. The removed lines instantiate ad_csc_1_mul and
// ad_csc_1_add; the added lines compute the three products and the running
// sum directly in always blocks.
parameter DELAY_DATA_WIDTH = 16) (
// DELAY_DW sizes the sync/csc_sync side-band bus.
// COLOR_N is compared against the localparams Y (1), Cb (2) and Cr (3) below
// and selects which products are bitwise inverted before being summed.
parameter DELAY_DW = 16,
parameter COLOR_N = 1) (
// data
input clk,
input [DW:0] sync,
input [23:0] data,
input clk,
input [DELAY_DW-1:0] sync,
input [ 23:0] data,
// constants
input [16:0] C1,
input [16:0] C2,
input [16:0] C3,
input [24:0] C4,
input signed [16:0] C1,
input signed [16:0] C2,
input signed [16:0] C3,
input signed [24:0] C4,
// sync is delay matched
output [DW:0] csc_sync_1,
output [ 7:0] csc_data_1);
output reg [DELAY_DW-1:0] csc_sync,
output [ 7:0] csc_data);
localparam DW = DELAY_DATA_WIDTH - 1;
// values accepted by COLOR_N
localparam Y = 1;
localparam Cb = 2;
localparam Cr = 3;
// internal wires
wire [24:0] data_1_m_s;
wire [24:0] data_2_m_s;
wire [24:0] data_3_m_s;
wire [DW:0] sync_3_m_s;
// data_d1 / data_d2: input pixel delayed by one and two clocks
reg [ 23:0] data_d1;
reg [ 23:0] data_d2;
// data_1..3: registered products for data[23:16], data[15:8] and data[7:0]
reg [ 33:0] data_1;
reg [ 33:0] data_2;
reg [ 33:0] data_3;
// NOTE(review): these are declared [DELAY_DW:0], one bit wider than the
// sync input and the csc_sync output ([DELAY_DW-1:0]) - confirm intended.
reg [DELAY_DW:0] sync_1_m;
reg [DELAY_DW:0] sync_2_m;
reg [DELAY_DW:0] sync_3_m;
// s_data_1..3: running sum, one more term added per clock
reg [ 33:0] s_data_1;
reg [ 33:0] s_data_2;
reg [ 33:0] s_data_3;
// c1*R
ad_csc_1_mul #(.DELAY_DATA_WIDTH(1)) i_mul_c1 (
.clk (clk),
.data_a (C1),
.data_b (data[23:16]),
.data_p (data_1_m_s),
.ddata_in (1'd0),
.ddata_out ());
// products after the optional per-COLOR_N inversion
wire signed [33:0] data_1_s;
wire signed [33:0] data_2_s;
wire signed [33:0] data_3_s;
// c2*G
ad_csc_1_mul #(.DELAY_DATA_WIDTH(1)) i_mul_c2 (
.clk (clk),
.data_a (C2),
.data_b (data[15:8]),
.data_p (data_2_m_s),
.ddata_in (1'd0),
.ddata_out ());
// Let the tools decide what logic to infer
// c3*B
// Product stage. The first product uses the undelayed input, the second the
// one-clock-delayed input and the third the two-clock-delayed input, so each
// product is registered on the clock before the sum stage that consumes it.
// NOTE(review): the zero-extended concatenation operand is unsigned, so the
// multiplications are evaluated as unsigned even though C1..C3 are declared
// signed - confirm the constants are always non-negative.
always @(posedge clk) begin
data_d1 <= data;
data_d2 <= data_d1;
data_1 <= {9'd0, data[23:16]} * C1; // R
data_2 <= {9'd0, data_d1[15: 8]} * C2; // G
data_3 <= {9'd0, data_d2[ 7: 0]} * C3; // B
sync_1_m <= sync;
end
ad_csc_1_mul #(.DELAY_DATA_WIDTH(DELAY_DATA_WIDTH)) i_mul_c3 (
.clk (clk),
.data_a (C3),
.data_b (data[7:0]),
.data_p (data_3_m_s),
.ddata_in (sync),
.ddata_out (sync_3_m_s));
// Per-component sign selection. '~' is a bitwise inversion (one's
// complement), not an arithmetic negation.
// NOTE(review): for a COLOR_N other than 1, 2 or 3 none of the branches
// below is generated and data_1_s..data_3_s are left undriven.
generate
if (COLOR_N == Y) begin
assign data_1_s = data_1;
assign data_2_s = data_2;
assign data_3_s = data_3;
end
if (COLOR_N == Cb) begin
assign data_1_s = ~data_1;
assign data_2_s = ~data_2;
assign data_3_s = data_3;
end
if (COLOR_N == Cr) begin
assign data_1_s = data_1;
assign data_2_s = ~data_2;
assign data_3_s = ~data_3;
end
endgenerate
// sum + c4
// Chained accumulation: C4 plus the first product, then the second, then the
// third, one addition per clock. sync passes through four registers
// (sync_1_m, sync_2_m, sync_3_m, csc_sync), the same number of register
// stages as data -> data_1 -> s_data_1 -> s_data_2 -> s_data_3.
always @(posedge clk) begin
s_data_1 <= data_1_s + C4;
s_data_2 <= s_data_1 + data_2_s;
s_data_3 <= s_data_2 + data_3_s;
sync_2_m <= sync_1_m;
sync_3_m <= sync_2_m;
csc_sync <= sync_3_m;
end
ad_csc_1_add #(.DELAY_DATA_WIDTH(DELAY_DATA_WIDTH)) i_add_c4 (
.clk (clk),
.data_1 (data_1_m_s),
.data_2 (data_2_m_s),
.data_3 (data_3_m_s),
.data_4 (C4),
.data_p (csc_data_1),
.ddata_in (sync_3_m_s),
.ddata_out (csc_sync_1));
// The output is bits [23:16] of the final sum, taken combinationally.
// NOTE(review): no clamping is applied here, whereas the removed
// ad_csc_1_add forced negative sums to 8'h00 and overflowing sums to 8'hff -
// confirm the sum cannot leave the representable range.
assign csc_data = s_data_3[23:16];
endmodule

View File

@ -1,147 +0,0 @@
// ***************************************************************************
// ***************************************************************************
// Copyright 2014 - 2017 (c) Analog Devices, Inc. All rights reserved.
//
// In this HDL repository, there are many different and unique modules, consisting
// of various HDL (Verilog or VHDL) components. The individual modules are
// developed independently, and may be accompanied by separate and unique license
// terms.
//
// The user should read each of these license terms, and understand the
// freedoms and responsibilities that he or she has by using this source/core.
//
// This core is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
// A PARTICULAR PURPOSE.
//
// Redistribution and use of source or resulting binaries, with or without modification
// of this file, are permitted under one of the following two license terms:
//
// 1. The GNU General Public License version 2 as published by the
// Free Software Foundation, which can be found in the top level directory
// of this repository (LICENSE_GPL2), and also online at:
// <https://www.gnu.org/licenses/old-licenses/gpl-2.0.html>
//
// OR
//
// 2. An ADI specific BSD license, which can be found in the top level directory
// of this repository (LICENSE_ADIBSD), and also on-line at:
// https://github.com/analogdevicesinc/hdl/blob/master/LICENSE_ADIBSD
// This will allow to generate bit files and not release the source code,
// as long as it attaches to an ADI device.
//
// ***************************************************************************
// ***************************************************************************
// Color Space Conversion, adder. This is a simple adder, but had to be
// pipe-lined for faster clock rates. The delay input is delay-matched to
// the sum pipe-line stages
`timescale 1ps/1ps
// Four-input adder for the color space conversion.
//
// The four inputs are 25-bit sign-magnitude values (bit 24 = sign,
// bits 23:0 = magnitude). They are converted to two's complement, summed in
// two pipelined steps and reduced to an unsigned, saturated 8-bit result.
// ddata_in is carried through the same four register stages as the data and
// appears on ddata_out together with the matching data_p.
module ad_csc_1_add #(

  // width of the delay-matched side-band bus
  parameter DELAY_DATA_WIDTH = 16) (

  // all signed

  input                                 clk,
  input       [24:0]                    data_1,
  input       [24:0]                    data_2,
  input       [24:0]                    data_3,
  input       [24:0]                    data_4,
  output  reg [ 7:0]                    data_p,

  // delay match
  // (sized from the parameter directly: the DW localparam is only declared
  // after the port list and must not be referenced before its declaration)

  input       [(DELAY_DATA_WIDTH-1):0]  ddata_in,
  output  reg [(DELAY_DATA_WIDTH-1):0]  ddata_out);

  localparam DW = DELAY_DATA_WIDTH - 1;

  // internal registers

  reg     [DW:0]  p1_ddata = 'd0;
  reg     [24:0]  p1_data_1 = 'd0;
  reg     [24:0]  p1_data_2 = 'd0;
  reg     [24:0]  p1_data_3 = 'd0;
  reg     [24:0]  p1_data_4 = 'd0;
  reg     [DW:0]  p2_ddata = 'd0;
  reg     [24:0]  p2_data_0 = 'd0;
  reg     [24:0]  p2_data_1 = 'd0;
  reg     [DW:0]  p3_ddata = 'd0;
  reg     [24:0]  p3_data = 'd0;

  // sign-magnitude to two's complement: the magnitude is zero-extended to
  // 25 bits and negated when the sign bit is set

  function [24:0] sm_to_tc;
    input [24:0] value;
    begin
      sm_to_tc = (value[24] == 1'b1) ? (~{1'b0, value[23:0]} + 1'b1) :
        {1'b0, value[23:0]};
    end
  endfunction

  // pipe line stage 1, get the two's complement versions

  always @(posedge clk) begin
    p1_ddata <= ddata_in;
    p1_data_1 <= sm_to_tc(data_1);
    p1_data_2 <= sm_to_tc(data_2);
    p1_data_3 <= sm_to_tc(data_3);
    p1_data_4 <= sm_to_tc(data_4);
  end

  // pipe line stage 2, get the sum (intermediate, 4->2)

  always @(posedge clk) begin
    p2_ddata <= p1_ddata;
    p2_data_0 <= p1_data_1 + p1_data_2;
    p2_data_1 <= p1_data_3 + p1_data_4;
  end

  // pipe line stage 3, get the sum (final, 2->1)

  always @(posedge clk) begin
    p3_ddata <= p2_ddata;
    p3_data <= p2_data_0 + p2_data_1;
  end

  // output registers, output is unsigned (0 if sum is < 0) and saturated.
  // the inputs are expected to be 1.4.20 format (output is 8bits).

  always @(posedge clk) begin
    ddata_out <= p3_ddata;
    if (p3_data[24] == 1'b1) begin
      // negative sum: clamp to zero
      data_p <= 8'h00;
    end else if (p3_data[23:20] == 'd0) begin
      // in range: keep the eight bits below the four integer bits
      data_p <= p3_data[19:12];
    end else begin
      // integer bits set: clamp to full scale
      data_p <= 8'hff;
    end
  end

endmodule
// ***************************************************************************
// ***************************************************************************

View File

@ -1,97 +0,0 @@
// ***************************************************************************
// ***************************************************************************
// Copyright 2014 - 2017 (c) Analog Devices, Inc. All rights reserved.
//
// In this HDL repository, there are many different and unique modules, consisting
// of various HDL (Verilog or VHDL) components. The individual modules are
// developed independently, and may be accompanied by separate and unique license
// terms.
//
// The user should read each of these license terms, and understand the
// freedoms and responsibilities that he or she has by using this source/core.
//
// This core is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
// A PARTICULAR PURPOSE.
//
// Redistribution and use of source or resulting binaries, with or without modification
// of this file, are permitted under one of the following two license terms:
//
// 1. The GNU General Public License version 2 as published by the
// Free Software Foundation, which can be found in the top level directory
// of this repository (LICENSE_GPL2), and also online at:
// <https://www.gnu.org/licenses/old-licenses/gpl-2.0.html>
//
// OR
//
// 2. An ADI specific BSD license, which can be found in the top level directory
// of this repository (LICENSE_ADIBSD), and also on-line at:
// https://github.com/analogdevicesinc/hdl/blob/master/LICENSE_ADIBSD
// This will allow to generate bit files and not release the source code,
// as long as it attaches to an ADI device.
//
// ***************************************************************************
// ***************************************************************************
// Color Space Conversion, multiplier. This is a simple partial product adder
// that generates the product of the two inputs.
`timescale 1ps/1ps
// Sign-magnitude multiplier wrapper for the color space conversion.
//
// data_a carries its sign in bit 16 and its magnitude in bits 15:0; data_b is
// an unsigned 8-bit value. The magnitudes are multiplied by the ad_mul
// instance, while the sign bit and the ddata_in side-band bus are each
// delayed by three register stages (a/b reg, m-reg, p-reg delay match -
// presumably the latency of ad_mul, which is not visible in this file).
module ad_csc_1_mul #(

  // width of the delay-matched side-band bus
  parameter DELAY_DATA_WIDTH = 16) (

  // data_a is signed

  input                             clk,
  input   [16:0]                    data_a,
  input   [ 7:0]                    data_b,
  output  [24:0]                    data_p,

  // delay match

  input   [(DELAY_DATA_WIDTH-1):0]  ddata_in,
  output  [(DELAY_DATA_WIDTH-1):0]  ddata_out);

  // three-deep delay line for the side-band data

  reg     [(DELAY_DATA_WIDTH-1):0]  ddata_stage_1 = 'd0;
  reg     [(DELAY_DATA_WIDTH-1):0]  ddata_stage_2 = 'd0;
  reg     [(DELAY_DATA_WIDTH-1):0]  ddata_stage_3 = 'd0;

  // three-deep shift register for the sign of data_a, bit 2 is the oldest

  reg     [ 2:0]                    sign_pipe = 3'd0;

  // magnitude product from the multiplier

  wire    [33:0]                    product_s;

  // advance both delay lines by one stage per clock

  always @(posedge clk) begin
    ddata_stage_1 <= ddata_in;
    ddata_stage_2 <= ddata_stage_1;
    ddata_stage_3 <= ddata_stage_2;
    sign_pipe <= {sign_pipe[1:0], data_a[16]};
  end

  // outputs: delayed side-band data, delayed sign on top of the low 24
  // product bits

  assign ddata_out = ddata_stage_3;
  assign data_p = {sign_pipe[2], product_s[23:0]};

  // magnitude multiplier, operands zero-extended to the ad_mul port widths

  ad_mul ad_mul_1 (
    .clk (clk),
    .data_a ({1'b0, data_a[15:0]}),
    .data_b ({9'b0, data_b}),
    .data_p (product_s),
    .ddata_in (16'h0),
    .ddata_out ());

endmodule
// ***************************************************************************
// ***************************************************************************

View File

@ -60,42 +60,51 @@ module ad_csc_RGB2CrYCb #(
// Cr (red-diff)
ad_csc_1 #(.DELAY_DATA_WIDTH(DELAY_DATA_WIDTH)) i_csc_1_Cr (
ad_csc #(
.DELAY_DW(DELAY_DATA_WIDTH),
.COLOR_N(3))
j_csc_1_Cr (
.clk (clk),
.sync (RGB_sync),
.data (RGB_data),
.C1 (17'h00707),
.C2 (17'h105e2),
.C3 (17'h10124),
.C4 (25'h0080000),
.csc_sync_1 (CrYCb_sync),
.csc_data_1 (CrYCb_data[23:16]));
.C1 (17'h7070),
.C2 (17'h5e27),
.C3 (17'h1248),
.C4 (24'h800002),
.csc_sync (CrYCb_sync),
.csc_data (CrYCb_data[23:16]));
// Y (luma)
ad_csc_1 #(.DELAY_DATA_WIDTH(1)) i_csc_1_Y (
ad_csc #(
.DELAY_DW(0),
.COLOR_N(1))
j_csc_1_Y (
.clk (clk),
.sync (1'd0),
.data (RGB_data),
.C1 (17'h0041b),
.C2 (17'h00810),
.C3 (17'h00191),
.C4 (25'h0010000),
.csc_sync_1 (),
.csc_data_1 (CrYCb_data[15:8]));
.C1 (17'h041bd),
.C2 (17'h0810e),
.C3 (17'h01910),
.C4 (24'h100000),
.csc_sync (),
.csc_data (CrYCb_data[15:8]));
// Cb (blue-diff)
ad_csc_1 #(.DELAY_DATA_WIDTH(1)) i_csc_1_Cb (
ad_csc #(
.DELAY_DW(0),
.COLOR_N(2))
j_csc_1_Cb (
.clk (clk),
.sync (1'd0),
.data (RGB_data),
.C1 (17'h1025f),
.C2 (17'h104a7),
.C3 (17'h00707),
.C4 (25'h0080000),
.csc_sync_1 (),
.csc_data_1 (CrYCb_data[7:0]));
.C1 (17'h25f1),
.C2 (17'h4a7e),
.C3 (17'h7070),
.C4 (24'h800002),
.csc_sync (),
.csc_data (CrYCb_data[7:0]));
endmodule