blob: 72f8ec08a5df5362e39673cfa6b0edbbfcecc83b [file] [log] [blame]
/* ****************************************************************************
-- (C) Copyright 2018 Kevin M. Hubbard - All rights reserved.
-- Source file: hyper_xface.v
-- Date: April 2018
-- Author: khubbard
-- Language: Verilog-2001
-- Simulation: Mentor-Modelsim
-- Synthesis: Xilinx-XST
-- License: This project is licensed with the CERN Open Hardware Licence
-- v1.2. You may redistribute and modify this project under the
-- terms of the CERN OHL v.1.2. (http://ohwr.org/cernohl).
-- This project is distributed WITHOUT ANY EXPRESS OR IMPLIED
-- WARRANTY, INCLUDING OF MERCHANTABILITY, SATISFACTORY QUALITY
-- AND FITNESS FOR A PARTICULAR PURPOSE. Please see the CERN OHL
-- v.1.2 for applicable Conditions.
-- Description: S27KL0641DABHI020 : Cypress IC DRAM 64MBIT 3V 100MHZ 24BGA
-- This is a dword interface module to HyperRAM for writing
-- and reading DWORDs. It is optimized for RTL portability and
-- simplicity rather than absolute bandwidth. The DRAM clock is
-- Div-4 of the core FPGA fabric clock in order to achieve reqd
-- 90-degree phase clock and data relationship per HyperRAM spec w/o
-- using an FPGA PLL. Latency is also max 2cyc ( about 12 RAM
-- clocks ) as to not require programming different values to the
-- control register defaults ( 2x latency with 166 MHz ).
-- Power On Reset : Part requires 150uS after Power On or Reset
--
-- Write Cycle:
-- clk /\/\/\
-- wr_req / \__
-- addr < >--
-- wr_d < >--
-- wr_byte_en <F>--
-- busy __/ \_
-- 1 2 3 ... . 14 15 16
-- dram_ck __/ \___/ \___/ \___/ \_ \___/ \___/ \___/ \__
-- dram_cs_l \___________________________________________________________/
-- dram_dq <A1><A2><A3><A4><A5><A6>--------------------<11><22><33><44>-
--
-- Read Cycle:
-- clk /\/\/\
-- rd_req / \___
-- addr < >---
-- num_dwrds < >---
-- busy __/ \_
-- rd_d -----------------------------------------------------------<>-
-- rd_rdy ___________________________________________________________/\_
-- 1 2 3 ... 14 15 16
-- dram_ck __/ \___/ \___/ \___/ \_ ___/ \___/ \___/ \__
-- dram_cs_l \____________________________________________________________/
-- dram_dq <A1><A2><A3><A4><A5><A6>---------------------<11><22><33><44>-
-- dram_rwds _/ \_____________________/ \___/ \____
--
-- Write Bursts:
-- Writes may be bursted in groups of 32bits by asserting wr_req with new data
-- every 8 clock cycles once burst_wr_rdy has asserted. Note Addr is ignored
-- wr_req / \___________________/ \__________/ \______________________
-- addr <A>---------------------------------------------------------
-- wr_d <B>-------------------<C>----------<D>----------------------
-- wr_byte_en <F>-------------------<F>----------<F>----------------------
-- |--| 5 clocks max
-- burst_wr_rdy _________________/ \_________/ \_________/ \_______________
-- dram_cs_l \______________________________________________________/
-- dram_dq --<A><A><A><A><A><A><B><B><B><B><C><C><C><C><D><D><D><D>---
--
-- Core Interface Description:
-- clk : in : FPGA clock. Actual DRAM clock will be this Div-4.
-- rd_req : in : When core not busy, assert 1ck to make read request.
-- wr_req : in : When core not busy, assert 1ck to make write request.
-- mem_or_reg : in : 0=DRAM Memory. 1=Configuration Register
-- wr_byte_en : in : 0xF=Write all 4 bytes. 0xE write Bytes 3-1 but not 0.
-- rd_burst_en : in : 1=Keep reading consecutive dwords (burst). 0=Stop after the current dword.
-- addr : in : 32bit byte (not dword) address for 64Mbit DRAM cell.
-- wr_d : in : 32bit Write Data to DRAM.
-- rd_d : out : 32bit Read Data from DRAM.
-- rd_rdy : out : Read Ready Strobe. Asserts 1ck when rd_d is valid.
-- busy : out : Busy Strobe asserts when Read or Write cycle is busy.
-- burst_wr_rdy : out : Asserts when ready to accept next wr_req for burst.
--
--
-- Example Setup: HyperRAM requires that both the DRAM and the Controller
-- agree to a fixed latency. The FPGA controller is configured via the
-- latency_1x and latency_2x input ports. The DRAM is configured via a
-- write to configuration register 0. The default setting is really slow
-- but has the advantage of not requiring a special configuration cycle
-- at the beginning of time. The difference is about 2x, for example 8 vs
-- 16 DRAM clocks for a single DWORD xfer. Default always uses 2x latency,
-- ignoring the rwds completely.
-- Default 6 Clock 166 MHz Latency, latency1x=0x12, latency2x=0x16
-- CfgReg0 write(0x00000800, 0x8f1f0000);
-- Configd 3 Clock 83 MHz Latency, latency1x=0x04, latency2x=0x0a
-- CfgReg0 write(0x00000800, 0x8fe40000);
-- ***************************************************************************/
`default_nettype none // Strictly enforce all nets to be declared
`timescale 1 ps/1 ps
module hyper_xface
(
// ---- Core (local bus) side. Everything is sampled on posedge clk. ----
input wire reset, // Active high. Clears the FSMs; also inverted onto dram_rst_l
input wire clk, // Core clock. dram_ck is a registered copy of a clk/4 phase bit
input wire rd_req, // Starts a read when no transaction is running
input wire wr_req, // Starts a write when idle; queues a burst dword when running
input wire mem_or_reg, // 0=MemSpace, 1=RegSpace. Captured with the request
input wire [3:0] wr_byte_en, // Per-byte write enables, bit 3 pairs with wr_d[31:24]
input wire rd_burst_en, // Sampled as each read dword completes. 1=keep reading
input wire [31:0] addr, // Address captured with the request
input wire [31:0] wr_d, // Write data, sent most significant byte first
output reg [31:0] rd_d, // Read data. Non-zero only in the clock rd_rdy is 1
output reg rd_rdy, // 1 clock strobe qualifying rd_d
output reg busy, // High while a transaction is starting or running
output reg burst_wr_rdy, // Pulses when the next burst write dword may be requested
input wire [7:0] latency_1x, // Wait count used when RWDS sampled 0. Only [5:0] are used
input wire [7:0] latency_2x, // Wait count used when RWDS sampled 1. Only [5:0] are used
// ---- HyperRAM pin side. All in/out paths are registered in proc_out. ----
input wire [7:0] dram_dq_in,
output reg [7:0] dram_dq_out,
output reg dram_dq_oe_l, // 0 = drive DQ, 1 = DQ is input
input wire dram_rwds_in,
output reg dram_rwds_out, // Write mask: 1 = do not write this byte
output reg dram_rwds_oe_l, // 0 = drive RWDS, 1 = RWDS is input
output reg dram_ck,
output wire dram_rst_l, // Combinatorial ~reset
output wire dram_cs_l,
output wire [7:0] sump_dbg // Debug taps: { rd_cnt[3:0], rd_done, sample_now, run_rd_jk, busy }
);// module hyper_xface
// Internal state. Names ending _sr are shift registers, _jk are set/clear
// flags, _loc / _p1 are registered copies of a signal.
reg [47:0] addr_sr; // Command+Address, shifted out from [47:40]
reg [31:0] data_sr; // Write data, shifted out from [31:24]
reg [31:0] rd_sr; // Read data, bytes shifted in at [7:0]
reg [1:0] ck_phs; // Counts while run_jk is 1. Bit 1 becomes dram_ck
reg [2:0] fsm_addr; // Command+Address bytes still to send ( loaded with 6 )
reg [3:0] fsm_data; // Data bytes still to send
reg [5:0] fsm_wait; // Latency steps still to wait
reg run_rd_jk; // Read capture enabled ( gates proc_rd_sr )
reg run_jk; // Transaction in progress
reg [3:0] run_jk_sr; // Delayed run_jk. Only bits [1:0] are read
reg go_bit; // 1 clock start pulse from proc_lb_regs to proc_fsm
reg rw_bit; // 0=WriteOp, 1=ReadOp for the current transaction
reg reg_bit; // 0=MemSpace, 1=RegSpace for the current transaction
reg rwds_in_loc; // Registered dram_rwds_in
reg rwds_in_loc_p1; // rwds_in_loc delayed one clock, for edge detect
reg byte_wr_en; // Enable for the byte in sr_data. Inverted onto RWDS
reg [7:0] sr_data; // Byte presented to dram_dq_out
reg [3:0] sr_byte_en; // Byte enables, shifted in step with data_sr
reg [7:0] dram_rd_d; // Registered dram_dq_in
reg addr_shift; // 1 clock strobe: advance addr_sr by a byte
reg data_shift; // 1 clock strobe: advance data_sr and sr_byte_en
reg wait_shift; // Written by proc_fsm but not read anywhere in this module
reg cs_loc; // Active high chip select, inverted into cs_l_reg
reg cs_l_reg; // Registered active low chip select driving dram_cs_l
reg dram_ck_loc; // First of two flops between ck_phs[1] and dram_ck
reg rd_done; // 1 clock strobe: 4 bytes are assembled in rd_sr
reg [3:0] rd_cnt; // Bytes captured so far in the current dword
reg [2:0] rd_fsm; // Read sampling state. 4 = waiting for a RWDS rising edge
reg rd_burst; // rd_burst_en as captured when the dword completed
reg sample_now; // 1 clock strobe: shift dram_rd_d into rd_sr
reg burst_wr_jk; // A burst write dword is waiting in burst_wr_d
reg burst_wr_jk_clr; // 1 clock strobe: burst_wr_d is being consumed
reg [4:0] burst_wr_sr; // History of burst_wr_rdy. Non-zero = burst wr_req accepted
reg [35:0] burst_wr_d; // Staged burst dword: { wr_byte_en[3:0], wr_d[31:0] }
reg dram_dq_oe; // Copied to dram_dq_oe_l. 0 = Output, 1 = Input
reg dram_rwds_oe; // Copied to dram_rwds_oe_l. 0 = Output, 1 = Input
assign dram_rst_l = ~ reset;
// Notes gleaned from datasheet:
// The clock is not required to be free-running.
//
// Note: RWDS and DQ are edge aligned
// During write transactions, data is center aligned with clock transitions.
//
// During write data transfers, RWDS is 1 to mask a data byte write.
//
// During read data transfers, RWDS is a read data strobe with data values
// edge aligned with the transitions of RWDS.
//
// The HyperRAM device may stop RWDS transitions with RWDS LOW, between the
// delivery of words, in order to insert latency between words when crossing
// memory array boundaries.
//
//
// Read 1x Latency
// |--------- 1x Latency ---------|
// CS_L \__________________________________________________________________/
// CK ____/ \___/ \___/ \___/ \___/ \___/ \___/ \___/ \_
// RWDS \____________________________________________________/ \___/ \___
// dir < input
// DQ[7:0] -<47><39><31><23><15><7 >---------------------------< >< >< >< >
// dir < output >---------------------------< input >
//
// Read 2x Latency
// |---1x Latency--|---2x Latency--|
// CS_L \__________________________________________________________________/
// CK ____/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \
// RWDS \______________________________/\__/\___
// DQ[7:0] ---<><><><><><>------------------------------<><><><>-----------
// dir < output >------------------------------<input >-----------
//
// Mem Write 1x Latency
// |---1x Latency--|---2x Latency--|
// CS_L \__________________________________________________________________/
// CK ____/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \
// RWDS __________/ \___________
// DQ[7:0] ---<><><><><><>------------<><><><>-----------
// dir < output >-<output>-----------
//
// Reg Write
//
// CS_L \__________________________________________________________________/
// CK ____/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \_/ \
// RWDS _____________________________
// DQ[7:0] ---<><><><><><><><>---------------------------
// dir < output >--------------------
//
// Command-Address Bit Packing:
// 47 R/W# Identifies the transaction as a read or write.
// R/W#=1 indicates a Read transaction
// R/W#=0 indicates a Write transaction
// 46 Address Space
// AS=0 indicates memory space
// AS=1 indicates the register space
// 45 Burst Type
// Indicates whether the burst will be linear or wrapped.
// Burst Type=0 indicates wrapped burst
// Burst Type=1 indicates linear burst
// 44-16 Row & Upper Column Address
// 15-3 Reserved for future column address expansion.
// 2-0 Lower Column Address
//
// Register Address Map CA[39:0] to A[31:0] mapping
// This module reduces 48bit address down to a 32bit address. This table
// records what the 32bit addresses are for the 4 registers + default values
// 0x000000 0000 : ID-Reg0 0x00000000 : 0x0c81
// 0x000000 0001 : ID-Reg1 0x00000001 : 0x0000
// 0x000100 0000 : Cfg-Reg0 0x00000800 : 0x8f1f
// 0x000100 0001 : Cfg-Reg1 0x00000801 : 0x0002
//-----------------------------------------------------------------------------
// DRAM clock is core clock over 4. This is to support the requirement of
// placing both clock edges on center of the DDR data. A full bandwidth design
// would require fancy PLL phase shifting which falls beyond the scope of this
// portable RTL only interface project.
//-----------------------------------------------------------------------------
always @ ( posedge clk ) begin : proc_ck
  // Phase counter for the DRAM clock. It is held at 0 whenever no
  // transaction is running and steps 0,1,2,3,0.. on every clk while run_jk
  // is set. proc_out registers bit 1 out as dram_ck; proc_fsm advances its
  // counters on the clocks where bit 0 is set.
  ck_phs <= 2'd0;
  if ( run_jk == 1 ) begin
    ck_phs <= ck_phs + 1;
  end
end // proc_ck
//-----------------------------------------------------------------------------
// Shift Registers for 48bits of Ctrl+Addr and 32bits of Write Data
//-----------------------------------------------------------------------------
always @ ( posedge clk ) begin : proc_lb_regs
begin
// Local bus side registers: request capture, the Command+Address and write
// data shift registers, burst write staging and the read data return.
// Several registers are assigned more than once below. The last assignment
// that executes in a clock wins, so the statement order is significant.
//
// Defaults. rd_d, rd_rdy and go_bit are only non-zero for a single clock.
rd_d <= 32'd0;
rd_rdy <= 0;
go_bit <= 0;
busy <= run_jk | go_bit;
// Move the next Command+Address byte up into addr_sr[47:40]
if ( addr_shift == 1 ) begin
addr_sr[47:0] <= { addr_sr[39:0], 8'd0 };
end
// Move the next write byte and its enable up into the MSB positions
if ( data_shift == 1 ) begin
data_sr[31:0] <= { data_sr[23:0], 8'd0 };
sr_byte_en[3:0] <= { sr_byte_en[2:0], 1'b0 };
end
// A staged burst dword is being consumed: reload instead of shifting
if ( burst_wr_jk_clr == 1 ) begin
data_sr[31:0] <= burst_wr_d[31:0];
sr_byte_en[3:0] <= burst_wr_d[35:32];
end
// New request while idle: capture everything and pulse go_bit. If rd_req
// and wr_req are both 1 the request is treated as a read ( rw_bit = rd_req ).
if ( run_jk == 0 && ( wr_req == 1 || rd_req == 1 ) ) begin
burst_wr_jk <= 0;
busy <= 1;
go_bit <= 1;// Kick off the FSM
sr_byte_en <= wr_byte_en[3:0];
rw_bit <= rd_req; // 0=WriteOp, 1=ReadOp
reg_bit <= mem_or_reg; // 0=MemSpace,1=RegSpace
addr_sr[47] <= rd_req; // 0=WriteOp, 1=ReadOp
addr_sr[46] <= mem_or_reg;// 0=MemSpace,1=RegSpace
addr_sr[45] <= 1'b1;// Linear Burst
addr_sr[15:3] <= 13'd0;
// Memory space: addr[30:2] goes to CA[44:16] and addr[1:0] to CA[2:1] with
// CA[0] forced to 0. addr[31] is not used.
// NOTE(review): this packing places addr[0] in CA[1], i.e. addr is used as
// a DWORD index, while the file header describes addr as a byte address.
// Confirm which is intended.
if ( mem_or_reg == 0 ) begin
addr_sr[44:16] <= addr[30:2];
addr_sr[2:0] <= { addr[1:0], 1'b0 };// Always getting DWORD
end else begin
// Register space: addr is passed through unmodified
addr_sr[44:16] <= addr[31:3];
addr_sr[2:0] <= addr[2:0];// Reg access needs 16bit LSB bit
end
data_sr[31:0] <= wr_d[31:0];
end
// Burst staging flag. The clear comes first so that a wr_req accepted in
// the same clock ( below ) leaves the flag set.
if ( burst_wr_jk_clr == 1 ) begin
burst_wr_jk <= 0;
end
// wr_req during a running transaction is only accepted while burst_wr_sr
// is non-zero, i.e. within 5 clocks after burst_wr_rdy was high.
if ( run_jk == 1 && wr_req == 1 && burst_wr_sr[4:0] != 5'd0 ) begin
burst_wr_jk <= 1;
burst_wr_d[31:0] <= wr_d[31:0];
burst_wr_d[35:32] <= wr_byte_en[3:0];
end
// Hand a completed read dword to the core for one clock
if ( rd_done == 1 ) begin
rd_d <= rd_sr[31:0];
rd_rdy <= 1;
end
// Reset only clears the start pulse and the burst flag. busy is not
// cleared here.
if ( reset == 1 ) begin
go_bit <= 0;
burst_wr_jk <= 0;
end
end
end // proc_lb_regs
//-----------------------------------------------------------------------------
// 3 State Machines:
// fsm_addr : Counts the Address Cycles
// fsm_wait : Counts the Latency Cycles
// fsm_data : Counts the Data Cycles
//-----------------------------------------------------------------------------
always @ ( posedge clk ) begin : proc_fsm
begin
// Transaction sequencer. fsm_addr, fsm_wait and fsm_data are down counters
// for the Command+Address, latency and data phases. They only advance on
// clocks where ck_phs[0] is 1. The rd_done, go_bit and reset sections come
// after the counter logic and therefore override it within the same clock.
//
// Defaults for the 1 clock strobes
addr_shift <= 0;
data_shift <= 0;
wait_shift <= 0;
burst_wr_jk_clr <= 0;
if ( ck_phs[0] == 1 ) begin
// ---- Command+Address phase: one byte of addr_sr per step ----
if ( fsm_addr != 3'd0 ) begin
dram_dq_oe <= 0; // D[7:0] is Output
dram_rwds_oe <= 1; // RWDS is Input
fsm_addr <= fsm_addr - 1;
// Last byte: decide what follows the Command+Address phase
if ( fsm_addr == 3'd1 ) begin
// Register Writes have zero latency
// Only 2 data bytes are sent, taken from data_sr[31:24] then [23:16]
if ( reg_bit == 1 && rw_bit == 0 ) begin
fsm_wait <= 6'd0;
fsm_data <= 4'd2;
end else begin
// Mem Writes Sample RWDS to determine 1 or 2 latency periods
// fsm_wait positions write data at appropriate place in time.
if ( rwds_in_loc == 0 ) begin
fsm_wait <= latency_1x[5:0];
end else begin
fsm_wait <= latency_2x[5:0];
end
end
// Reads override the latency selection made above and enable proc_rd_sr
if ( rw_bit == 1 ) begin
fsm_wait <= 6'd63;// This actually ends from RWDS strobing
run_rd_jk <= 1;
end
end else begin
fsm_wait <= 6'd0;
fsm_data <= 4'd0;
end
sr_data <= addr_sr[47:40];
addr_shift <= 1;
end
// ---- Latency phase ----
if ( fsm_wait != 6'd0 ) begin
byte_wr_en <= 0;
wait_shift <= 1;
fsm_wait <= fsm_wait - 1;
if ( fsm_wait == 6'd1 ) begin
fsm_data <= 4'd4;// Number of Bytes to Write
end
// sr_data <= { 2'd0, fsm_wait[5:0] };// Marker for when Latency is wrong
end
// ---- Data phase: one byte of data_sr and its byte enable per step ----
if ( fsm_data != 4'd0 ) begin
fsm_data <= fsm_data - 1;
sr_data <= data_sr[31:24];
byte_wr_en <= sr_byte_en[3];
data_shift <= 1;
// Last byte: finish, unless a burst dword is staged, in which case the
// data phase is restarted and proc_lb_regs is told to reload data_sr.
if ( fsm_data == 4'd1 ) begin
run_jk <= 0;
if ( burst_wr_jk == 1 ) begin
run_jk <= 1;
burst_wr_jk_clr <= 1;
fsm_data <= 4'd4;// Number of Bytes to Write
end
end
end
// Pin direction during the latency and data phases
if ( fsm_wait != 6'd0 || fsm_data != 4'd0 ) begin
if ( rw_bit == 1 ) begin
dram_dq_oe <= 1; // Input for Reads
dram_rwds_oe <= 1; // Input for Reads
end else begin
dram_dq_oe <= 0; // Output for Writes
dram_rwds_oe <= 0; // Output for Writes
end
end
end // if ( ck_phs[0] == 1 ) begin
// A read dword has been assembled by proc_rd_sr. rd_burst was captured from
// rd_burst_en in the same clock that rd_done was set.
if ( rd_done == 1 ) begin
if ( !rd_burst ) begin
run_jk <= 0;
run_rd_jk <= 0;
fsm_wait <= 6'd0;
end else begin
//rd_dwords_cnt <= rd_dwords_cnt - 1;
fsm_wait <= 6'd63;// This actually ends from RWDS strobing
end
end
// Start of a new transaction: 6 Command+Address bytes
if ( go_bit == 1 ) begin
fsm_addr <= 3'd6;
fsm_wait <= 6'd0;
fsm_data <= 4'd0;
run_jk <= 1;
dram_dq_oe <= 1; // Default Input
dram_rwds_oe <= 1; // Default Input
end
// Chip select asserts with run_jk and is released only once run_jk and its
// two previous samples are all 0, which stretches it past the last byte.
run_jk_sr <= { run_jk_sr[2:0], run_jk };
if ( run_jk == 1 ) begin
cs_loc <= 1;
end else if ( run_jk_sr[1:0] == 2'd0 ) begin
cs_loc <= 0;
dram_dq_oe <= 1; // Default Input
dram_rwds_oe <= 1; // Default Input
end
if ( reset == 1 ) begin
fsm_addr <= 3'd0;
fsm_data <= 4'd0;
fsm_wait <= 6'd0;
run_jk <= 0;
run_rd_jk <= 0;
byte_wr_en <= 0;
cs_loc <= 0;
end
// burst_wr_rdy goes high when fsm_data is 4. The burst_wr_rdy == 0 term
// keeps it from being high on two consecutive clocks.
burst_wr_rdy <= 0;
if ( fsm_data == 4'd4 && burst_wr_rdy == 0 ) begin
burst_wr_rdy <= 1;
end
// Protection against wr_req coming in too late. There is a 5 clock window
burst_wr_sr[4:0] <= { burst_wr_sr[3:0], burst_wr_rdy };
end
end // proc_fsm
//-----------------------------------------------------------------------------
// Read SR
// clk /\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\
// 0 1 2 3 0 1 2
// CK ___/ \___/ \___/ \___/ \___/ \___/ \___/ \___/ \_
// RWDS \___________________________________________________/ \___/ \___
// dir < input
// DQ[7:0]-<47><39><31><23><15><7 >---------------------------< >< >< >< >
//-----------------------------------------------------------------------------
always @ ( posedge clk ) begin : proc_rd_sr
  // Read capture. While run_rd_jk is set, wait ( rd_fsm == 4 ) for a rising
  // edge on the registered RWDS, request a sample, then request a second
  // sample two clocks later and go back to waiting. Each sample_now strobe
  // shifts one byte into rd_sr; the 4th byte raises rd_done for one clock.

  // Defaults for the 1 clock strobes and the RWDS history flop
  rd_done        <= 0;
  sample_now     <= 0;
  rwds_in_loc_p1 <= rwds_in_loc;

  // Sampling sequencer
  if ( run_rd_jk == 0 ) begin
    rd_fsm <= 3'd4;
    rd_cnt <= 4'd0;
  end else if ( rd_fsm == 3'd4 ) begin
    // Waiting: look for previous sample 0, current sample 1
    if ( { rwds_in_loc_p1, rwds_in_loc } == 2'b01 ) begin
      rd_fsm     <= 3'd0;
      sample_now <= 1;
    end
  end else if ( rd_fsm == 3'd1 ) begin
    // Second byte of the pair, then return to waiting
    rd_fsm     <= 3'd4;
    sample_now <= 1;
  end else begin
    rd_fsm <= rd_fsm + 1;
  end

  // Byte collection. Placed after the sequencer so that its rd_cnt update
  // takes priority over the idle clear above.
  if ( sample_now == 1 ) begin
    rd_sr[31:0] <= { rd_sr[23:0], dram_rd_d[7:0] };
    if ( rd_cnt == 4'd3 ) begin
      rd_done  <= 1;// Call it a day after 4 bytes
      rd_cnt   <= 4'd0;
      rd_burst <= rd_burst_en;
    end else begin
      rd_cnt <= rd_cnt + 1;
    end
  end
end // proc_rd_sr
// Pipe out some signals for debugging using SUMP
//   [7:4] rd_cnt   [3] rd_done   [2] sample_now   [1] run_rd_jk   [0] busy
assign sump_dbg[7:0] = { rd_cnt[3:0], rd_done, sample_now, run_rd_jk, busy };
//-----------------------------------------------------------------------------
// IO Flops
//-----------------------------------------------------------------------------
always @ ( posedge clk ) begin : proc_out
  // Register stage between the internal logic and the DRAM pins.

  // Inbound pins
  dram_rd_d   <= dram_dq_in[7:0];
  rwds_in_loc <= dram_rwds_in;

  // DRAM clock: ck_phs[1] delayed by two flops
  dram_ck_loc <= ck_phs[1];
  dram_ck     <= dram_ck_loc;

  // Outbound data, mask and direction controls
  dram_dq_out    <= sr_data[7:0];
  dram_dq_oe_l   <= dram_dq_oe;
  dram_rwds_out  <= ~ byte_wr_en;// Note: rwds is a mask, 1==Don't Write Byte
  dram_rwds_oe_l <= dram_rwds_oe;

  // Chip select is forced inactive ( high ) while reset is asserted
  if ( reset == 1 ) begin
    cs_l_reg <= 1;
  end else begin
    cs_l_reg <= ~ cs_loc;
  end
end // proc_out
assign dram_cs_l = cs_l_reg;
endmodule // hyper_xface.v