Did you not take a look at my above attached code (2 posts up) with the 2 versions and 3 errors and work from there???
Send me a working Freebasic version for testing...
(Region 1&2 are your old 'Inv' function - green / red lines of the complex geoarc function...)
Ok, please scrutinize the crap out of this FreeBasic version. If it is good, I will code it in Verilog for you.
When I say scrutinize, I mean make sure that the graphics are rendered perfectly.
Also, make sure the code looks good.
line 41: localparam int CYCLE_TIME_STAMP_COUNTER_WIDTH = $clog2(20'(int'(real'(CYCLE_TIME_STAMP_COUNTER_IDEAL) * 1.1))); // Account for 10% deviation in audio clock
//
// video_source
//
// Generates a 640x480 test pattern and drives it out as three serialised
// TMDS data channels plus a differential pixel clock.
//
//   clk                 - 25MHz pixel clock
//   TMDSp/TMDSn[2:0]    - differential TMDS data, [2]=red, [1]=green, [0]=blue
//   TMDS_CLKp/TMDS_CLKn - differential copy of the pixel clock
//
// Relies on the vendor primitives DCM_SP, BUFG and OBUFDS and on the
// TMDS_encoder module, all of which must be supplied by the build.
//
module video_source(
    // inputs
    input        clk,        // 25MHz pixel clock
    // outputs
    output [2:0] TMDSp,      // TMDS data out
    output [2:0] TMDSn,
    output       TMDS_CLKp,  // pixel clock out
    output       TMDS_CLKn
);

    // Raster timing: pixel positions within a line and line positions within a frame.
    localparam [9:0] H_VISIBLE    = 10'd640 ; // first non-visible pixel of a line
    localparam [9:0] H_SYNC_START = 10'd656 ; // first pixel with hSync high
    localparam [9:0] H_SYNC_END   = 10'd752 ; // first pixel after hSync
    localparam [9:0] H_LAST       = 10'd799 ; // last pixel of a line (800 pixels per line)
    localparam [9:0] V_VISIBLE    = 10'd480 ; // first non-visible line of a frame
    localparam [9:0] V_SYNC_START = 10'd490 ; // first line with vSync high
    localparam [9:0] V_SYNC_END   = 10'd492 ; // first line after vSync
    localparam [9:0] V_LAST       = 10'd524 ; // last line of a frame (525 lines per frame)

    // All state is given an explicit power-up value.  Without it the counters
    // start as X in simulation and, because every next-state expression depends
    // on the current value, never leave X.
    reg [9:0] x     = 10'd0 ; // horizontal pixel counter
    reg [9:0] y     = 10'd0 ; // vertical pixel counter
    reg       hSync = 1'b0 ;
    reg       vSync = 1'b0 ;
    reg       D_EN  = 1'b0 ; // Display ENable

    /* Create timing signals for a valid 640x480 display.
     *
     * D_EN is asserted while the raster is inside the visible area, the
     * raster counters (x and y) advance and wrap at the end of each line
     * and frame, and hSync / vSync go high inside their sync windows.
     *
     * D_EN, hSync and vSync are registered from the same x/y values as the
     * colour registers below, so all of them are delayed by one clock
     * relative to the counters and stay aligned with each other.
     */
    always @(posedge clk) begin
        D_EN <= ( x < H_VISIBLE ) && ( y < V_VISIBLE ) ;   // inside visible display area

        x <= ( x == H_LAST ) ? 10'd0 : x + 10'd1 ;         // next pixel, or wrap at end of line

        if ( x == H_LAST ) begin                           // end of row reached
            y <= ( y == V_LAST ) ? 10'd0 : y + 10'd1 ;     // next line, or wrap at end of frame
        end

        hSync <= ( x >= H_SYNC_START ) && ( x < H_SYNC_END ) ; // high for pixels 656..751
        vSync <= ( y >= V_SYNC_START ) && ( y < V_SYNC_END ) ; // high for lines 490..491
    end

    // Pattern helpers, each replicated to a full 8-bit mask:
    //   W - all ones where the low bytes of x and y are equal (diagonal lines)
    //   A - all ones where x[7:5] and y[7:5] both equal 2 (repeating square area)
    wire [7:0] W = {8{ x[7:0] == y[7:0] }} ;
    wire [7:0] A = {8{ x[7:5] == 3'h2 && y[7:5] == 3'h2 }} ;

    reg [7:0] red   = 8'd0 ;
    reg [7:0] green = 8'd0 ;
    reg [7:0] blue  = 8'd0 ;

    // Create a display pattern
    always @(posedge clk) begin
        red   <= ( { x[5:0] & { 6{ y[4:3] == ~x[4:3] } }, 2'b00 } | W ) & ~A ;
        green <= ( ( x[7:0] & { 8{ y[6] } } ) | W ) & ~A ;
        blue  <= y[7:0] | W | A ;
    end

    //
    // Create three TMDS_encoder instances, one per colour channel.
    // Only the blue channel carries the sync signals as control data;
    // red and green carry a constant 2'b00 control value.
    //
    wire [9:0] TMDS_red ;
    wire [9:0] TMDS_green ;
    wire [9:0] TMDS_blue ;

    TMDS_encoder encode_R(
        .clk ( clk ),
        .VD  ( red ),
        .CD  ( 2'b00 ),
        .VDE ( D_EN ),
        .TMDS( TMDS_red )
    );

    TMDS_encoder encode_G(
        .clk ( clk ),
        .VD  ( green ),
        .CD  ( 2'b00 ),
        .VDE ( D_EN ),
        .TMDS( TMDS_green )
    );

    TMDS_encoder encode_B(
        .clk ( clk ),
        .VD  ( blue ),
        .CD  ( { vSync, hSync } ),
        .VDE ( D_EN ),
        .TMDS( TMDS_blue )
    );

    //
    // Multiply 25MHz clock by 10 to generate a 250MHz clock
    //
    wire clk_TMDS ;
    wire DCM_TMDS_CLKFX ; // 25MHz x 10 = 250MHz

    DCM_SP #(.CLKFX_MULTIPLY(10)) DCM_TMDS_inst(.CLKIN(clk), .CLKFX(DCM_TMDS_CLKFX), .RST(1'b0) ) ;
    BUFG BUFG_TMDSp(.I(DCM_TMDS_CLKFX), .O(clk_TMDS)) ;

    //
    // Create three 10-bit shift registers running at 250MHz.
    // Each 10-bit symbol is loaded once every ten fast clocks and then
    // shifted out LSB first, with zeros filling from the top.
    //
    reg [3:0] TMDS_mod10       = 4'd0 ;  // modulus 10 counter
    reg [9:0] TMDS_shift_red   = 10'd0 ;
    reg [9:0] TMDS_shift_green = 10'd0 ;
    reg [9:0] TMDS_shift_blue  = 10'd0 ;
    reg       TMDS_shift_load  = 1'b0 ;

    always @(posedge clk_TMDS) begin
        // load strobe is registered, so it is high in the cycle after the counter reads 9
        TMDS_shift_load  <= ( TMDS_mod10 == 4'd9 ) ;
        TMDS_shift_red   <= TMDS_shift_load ? TMDS_red   : { 1'b0, TMDS_shift_red  [9:1] } ;
        TMDS_shift_green <= TMDS_shift_load ? TMDS_green : { 1'b0, TMDS_shift_green[9:1] } ;
        TMDS_shift_blue  <= TMDS_shift_load ? TMDS_blue  : { 1'b0, TMDS_shift_blue [9:1] } ;
        TMDS_mod10       <= ( TMDS_mod10 == 4'd9 ) ? 4'd0 : TMDS_mod10 + 4'd1 ;
    end

    // Differential output buffers: bit 0 of each shift register is the serial stream.
    OBUFDS OBUFDS_red  ( .I( TMDS_shift_red  [0] ), .O( TMDSp[2] ), .OB( TMDSn[2] ) ) ;
    OBUFDS OBUFDS_green( .I( TMDS_shift_green[0] ), .O( TMDSp[1] ), .OB( TMDSn[1] ) ) ;
    OBUFDS OBUFDS_blue ( .I( TMDS_shift_blue [0] ), .O( TMDSp[0] ), .OB( TMDSn[0] ) ) ;
    OBUFDS OBUFDS_clock( .I( clk ),                 .O( TMDS_CLKp ), .OB( TMDS_CLKn ) ) ;

endmodule
//*********************************************************************************************************
//
// TMDS Encoder Module
//
//*********************************************************************************************************
//
// TMDS_encoder
//
// Converts one 8-bit video sample (or a 2-bit control value) into a 10-bit
// TMDS symbol on every rising edge of clk.
//
//   clk  - symbol clock
//   VD   - video data (red, green or blue)
//   CD   - control data, emitted as one of four fixed symbols when VDE = 0
//   VDE  - video data enable: 1 selects encoded VD, 0 selects the CD symbol
//   TMDS - registered 10-bit output symbol
//
// The running disparity accumulator is cleared whenever VDE is low.
//
module TMDS_encoder(
    input            clk,
    input      [7:0] VD,   // video data (red, green or blue)
    input      [1:0] CD,   // control data
    input            VDE,  // video data enable, to choose between CD (when VDE=0) and VD (when VDE=1)
    output reg [9:0] TMDS = 0
);

    // ---------------------------------------------------------------
    // Stage 1: transition-minimised 9-bit word
    // ---------------------------------------------------------------

    // Number of set bits in the incoming byte.
    wire [3:0] ones_in = VD[0] + VD[1] + VD[2] + VD[3] + VD[4] + VD[5] + VD[6] + VD[7] ;

    // XNOR chaining is chosen for more than four ones, or exactly four with VD[0] clear.
    wire use_xnor = ( ones_in > 4'd4 ) || ( ( ones_in == 4'd4 ) && !VD[0] ) ;

    // Bit 0 is passed through, bits 7:1 are chained from the previous bit,
    // and bit 8 records which operation was used (1 = XOR, 0 = XNOR).
    wire [8:0] stage1 ;
    assign stage1[0]   = VD[0] ;
    assign stage1[7:1] = stage1[6:0] ^ VD[7:1] ^ { 7{ use_xnor } } ;
    assign stage1[8]   = ~use_xnor ;

    // ---------------------------------------------------------------
    // Stage 2: DC balancing
    // ---------------------------------------------------------------

    reg  [3:0] running = 0 ;   // signed running disparity accumulator

    // Signed excess of ones over four in the low eight bits of stage1.
    wire [3:0] disparity = stage1[0] + stage1[1] + stage1[2] + stage1[3]
                         + stage1[4] + stage1[5] + stage1[6] + stage1[7] - 4'd4 ;

    wire same_sign = ( disparity[3] == running[3] ) ;
    wire neutral   = ( disparity == 0 ) || ( running == 0 ) ;

    // Invert the data bits when that pulls the running disparity back toward zero;
    // with nothing to correct, the decision follows stage1[8] instead.
    wire invert = neutral ? ~stage1[8] : same_sign ;

    // Single-bit correction for the two flag bits of the symbol; kept as a
    // 1-bit net so it is zero-extended when subtracted below.
    wire correction = ( stage1[8] ^ ~same_sign ) & ~neutral ;

    wire [3:0] step         = disparity - correction ;
    wire [3:0] running_next = invert ? running - step : running + step ;

    // Final data symbol: {invert flag, operation flag, possibly inverted data}.
    wire [9:0] data_symbol = { invert, stage1[8], stage1[7:0] ^ { 8{ invert } } } ;

    // ---------------------------------------------------------------
    // Output register
    // ---------------------------------------------------------------
    always @(posedge clk) begin
        if ( VDE ) begin
            TMDS    <= data_symbol ;
            running <= running_next ;
        end else begin
            running <= 4'h0 ;
            case ( CD )
                2'b00 : TMDS <= 10'b1101010100 ;
                2'b01 : TMDS <= 10'b0010101011 ;
                2'b10 : TMDS <= 10'b0101010100 ;
                2'b11 : TMDS <= 10'b1010101011 ;
            endcase
        end
    end

endmodule
... but then I never expect to clone a github repo and it just 'work' straight away. They normally take more time to get working than it would take to build the thing from scratch, but I was hoping that I might be able to save some time and also use a more powerful interface (the inclusion of audio in the output is very desirable).
I think this is what you want?
Looks like another issue with the 2nd region - the red line jumps off on a tangent when the ellipse becomes very wide.
The reason there are so many code examples for Xilinx FPGA is because it's very trivial to implement HDMI on that platform.
OBUFDS - Output BUFfer with Differential Signalling
DCM - Digital Clock Manager (think of it as advanced PLL with additional functionality)
BUFG - global clock buffer (entry into low-skew lines designed to distribute clock signals across the die)
The code you've posted does not use SERDES, which is fine for lower frequencies, but won't work for things like 720p, which needs a 742.5 Mbit/s TMDS bit rate per channel (74.25 MHz pixel clock x 10).
The reason there are so many code examples for Xilinx FPGA is because it's very trivial to implement HDMI on that platform.
OBUFDS - Output BUFfer with Differential Signalling
DCM - Digital Clock Manager (think of it as advanced PLL with additional functionality)
BUFG - global clock buffer (entry into low-skew lines designed to distribute clock signals across the die)
The code you've posted does not use SERDES, which is fine for lower frequencies, but won't work for things like 720p, which needs a 742.5 Mbit/s TMDS bit rate per channel (74.25 MHz pixel clock x 10).
Yep. Out of curiosity, do you think there is any possibility that SERDES could be inferred from pure HDL (maybe with a particular coding style), or is it never going to be inferred, so that you need to explicitly instantiate it?
Yep. Out of curiosity, do you think there is any possibility that SERDES could be inferred from pure HDL (maybe with a particular coding style), or is it never going to be inferred, so that you need to explicitly instantiate it?
I've done some simulations using various dimensions for the ellipse - have attached some typical variations, and have tested permutations of X and Y from 0-40 pixels (and other sizes without checking ALL the pixels) - all are pixel-perfect in the simulation output compared with the expected output provided by the FreeBasic code.
I can't get it to break.
When sub_function == 3
p <= (alu_mult_y + 2) >> 2 ;
When sub_function == 6
p <= p + ry2 - alu_mult_y ;
When sub_function == 7 && (px <= py) && (p <= 0)
p <= p + ry2 + (px + (ry2<<1)) ;
When sub_function == 7 && (px <= py) && !(p <= 0)
p <= p + ry2 + (px + (ry2<<1)) - (py - (rx2<<1)) ;
p <= ( p * (sub_function !=3)) +
(( (((alu_mult_y + 2) >> 2) * (sub_function == 3)) )) -
(( ((alu_mult_y) * (sub_function == 6)) )) +
(( ry2 * ( sub_function == 6 || (sub_function == 7 && (px <= py)) ) )) +
(( (px + (ry2<<1)) * ((sub_function == 7) && (px <= py)) )) -
(( (py - (rx2<<1)) * ((sub_function == 7) && (px <= py) && !(p <= 0)) )) ;
sub_function = 4 bits
(px<=py) = 32+32 bits
(p<=0) = 32 bits
'p' = 32 bits
alu_mult_y = 32bits *2=64 (shifted and non shifted)
rx2,ry2,px,py = 32*4bits =128
TOTAL: 324 bits / 324 wires/signals to generate the result 'p'.
p <= ( p * (sub_function !=3)) +
(( ry2 + (px + (ry2<<1))) * ((sub_function == 7) && (px <= py)) ) -
(( (py - (rx2<<1)) * ((sub_function == 7) && (px <= py) && !(p <= 0)) )) ;
p <= (alu_mult_y + 2) >> 2 ;
p <= p + ry2 - alu_mult_y ;
And not add any complexity/dependencies to the above test 132MHz FMAX equation.
Tutorial for Nockieboy in improving FMAX. Part 1.
...
Below, I'm showing you how the compiler constructs the logic for calculating 'p' (approximately). Remember, the FPGA is not a CPU passing memory variable to and from a single ALU, all the above instructions need to be combined into a single set of gates to make the 32 bit register 'p' equal the following function at the core clock of 125MHz.
(Yes, I tried to get this right, so analyze it...)
Code: [Select]
p <= ( p * (sub_function !=3)) +
(( (((alu_mult_y + 2) >> 2) * (sub_function == 3)) )) -
(( ((alu_mult_y) * (sub_function == 6)) )) +
(( ry2 * ( sub_function == 6 || (sub_function == 7 && (px <= py)) ) )) +
(( (px + (ry2<<1)) * ((sub_function == 7) && (px <= py)) )) -
(( (py - (rx2<<1)) * ((sub_function == 7) && (px <= py) && !(p <= 0)) )) ;
YES, all that shit... Though, the compiler will simplify the algebra as much as possible, this is the mess that 32 bit register 'p' must equal with all those other variables being 32 bits which feed a mass of gates to compute for the D-flipflop 32 bit data input. Apparently, the necessary entire mass of gates will fail to guarantee the correct solution when register 'p' is clocked (with everything else of course) above 117MHz.
if (sub_function == 4)
p <= p + ry2 ;
if (sub_function == 5)
p <= p + ry2 ;
if (sub_function == 6)
p <= p - alu_mult_y ;
if (sub_function == 7) && (px <= py) && (p <= 0)
p <= p + ry2 + (px + (ry2<<1)) ;
if (sub_function == 7) && (px <= py) && !(p <= 0)
p <= p + ry2 + (px + (ry2<<1)) - (py - (rx2<<1)) ;
p <= p + ( ( ( sub_function > 3 ) && ( sub_function != 6 ) ) * ry2 )
- ( ( sub_function == 6 ) * alu_mult_y )
+ ( ( ( sub_function == 7 ) && ( px <= py ) ) * ( px + ( ry2<<1) ) )
- ( ( ( sub_function == 7 ) && ( px <= py ) && ( p > 0) ) * ( py - ( rx2<<1) ) )
'p' is dependent on the sub_function[3:0] number, (px <= py), !(p <= 0), plus the 32-bit register 'p' itself since it is being added to itself, then alu_mult_y (both added by 2 and shifted, and again natively), rx2, ry2, px and py.
Code: [Select]
sub_function = 4 bits
(px<=py) = 32+32 bits
(p<=0) = 32 bits
'p' = 32 bits
alu_mult_y = 32bits *2=64 (shifted and non shifted)
rx2,ry2,px,py = 32*4bits =128
TOTAL: 324 bits / 324 wires/signals to generate the result 'p'.
Test V9 attached spaghetti code.
Snapshots not necessary unless there are errors...
(I also found out I'm doing 2 sub_functions uselessly identically twice, the correction will be done next.)
p <= p + ( ry2 << 1 ) ;
if (sub_function == 4)
p <= p + ry2 ;
if (sub_function == 5)
p <= p + ry2 - alu_mult_y ;
OR, would it be better to do this instead and remove step 6 entirely by merging it into step 5? EDIT: Have just realised - the ALU takes 2 clocks for its result to be valid, so this may not be a valid solution.
Code: [Select]
if (sub_function == 4)
p <= p + ry2 ;
if (sub_function == 5)
p <= p + ry2 - alu_mult_y ;
That would remove one step from sub_function but also simplify the logic for the entire system, as this:
p <= p + ( ( ( sub_function > 3 ) && ( sub_function != 6 ) ) * ry2 )
- ( ( sub_function == 6 ) * alu_mult_y )
+ ( ( ( sub_function == 7 ) && ( px <= py ) ) * ( px + ( ry2<<1) ) )
- ( ( ( sub_function == 7 ) && ( px <= py ) && ( p > 0) ) * ( py - ( rx2<<1) ) )
...could become this:
p <= p + ( ( sub_function > 3 ) * ry2 ) <<-- simplifies this line by one dependency
- ( ( sub_function == 5 ) * alu_mult_y )
+ ( ( ( sub_function == 6 ) && ( px <= py ) ) * ( px + ( ry2<<1) ) )
- ( ( ( sub_function == 6 ) && ( px <= py ) && ( p > 0) ) * ( py - ( rx2<<1) ) )