DCN

基本信息

DCN 系列分两版,以下信息基于 DCNv1(原始论文);DCNv2 见正文 ## DCNv2 节。

字段	内容
标题	Deep & Cross Network for Ad Click Predictions
作者	Ruoxi Wang, Bin Fu, Gang Fu, Mingliang Wang
机构	Stanford University; Google
年份	2017 (AdKDD’17)
方向	Explicit Feature Crossing, Cross Network, Wide-Deep Combination
场景	广告 CTR 预估中的显式 + 隐式特征交叉
arXiv	DCNv1 https://arxiv.org/abs/1708.05123 · DCNv2 https://arxiv.org/abs/2008.13535

显式 + 自动的特征交叉

DNN：隐式特征交叉

LR：大量依赖人工特征工程

DCN = Deep Network + Cross Network

DCN先后推出了DCNv1 和 DCNv2两个版本

DCNv1

https://arxiv.org/abs/1708.05123

特征交叉发生在x0和xl^T的矩阵乘法上

参数量：d × L × 2（d 为输入维度，L 为 Cross 层数；每层包含一个 d 维权重向量 w_l 和一个 d 维偏置向量 b_l）

利用矩阵乘法的结合律，先计算 x_l^T w_l（得到一个标量）再乘以 x_0，单层计算成本可以从 O(d^2) 降到 O(d)

缺点：

权重矩阵参数量小，模型能力有限
受限于 x_0 的特定形式：忽略偏置项时，每层的输出 x_l 都是 x_0 的标量倍（该标量本身依赖于 x_0），表达能力受限
是拼接后的bit-wise交互，可能会破坏每个feature原始的含义，相比vector-wise可能效果要差一些

import torch
import torch.nn as nn
import torch.nn.functional as F

class CrossNetwork(nn.Module):
    """Cross network of DCN (v1): a stack of vector-kernel cross layers.

    Every layer applies ``x_{l+1} = x_0 * (x_l^T w_l) + b_l + x_l`` with its
    own weight ``w_l`` and bias ``b_l``, both stored with shape
    ``(input_dim, 1)``, so the feature width never changes.
    """

    def __init__(self, input_dim, num_layers):
        """Create ``num_layers`` cross layers over ``input_dim`` features.

        Kernels are Xavier-normal initialised; biases start at zero.
        """
        super().__init__()
        self.num_layers = num_layers

        # One (input_dim, 1) kernel per layer, i.e. the w_l of the formula.
        weights = []
        for _ in range(num_layers):
            w = torch.empty(input_dim, 1)
            torch.nn.init.xavier_normal_(w)
            weights.append(nn.Parameter(w))
        self.kernels = nn.ParameterList(weights)

        # One (input_dim, 1) bias per layer, i.e. the b_l of the formula.
        offsets = [
            nn.Parameter(torch.zeros(input_dim, 1)) for _ in range(num_layers)
        ]
        self.biases = nn.ParameterList(offsets)

    def forward(self, x_0):
        """Run the input through all cross layers.

        Args:
            x_0: tensor of shape (batch_size, input_dim).

        Returns:
            Tensor of shape (batch_size, input_dim).
        """
        # Work with column vectors: (B, D) -> (B, D, 1).
        x_0 = x_0.unsqueeze(2)
        state = x_0

        for layer in range(self.num_layers):
            # x_l^T w_l: (B, 1, D) @ (D, 1) -> one scalar per sample, (B, 1, 1).
            scalar = torch.matmul(state.transpose(1, 2), self.kernels[layer])
            # Scale x_0 by that scalar (broadcast over D), then add the bias
            # and the residual x_l.
            state = x_0 * scalar + self.biases[layer] + state

        # Drop the trailing singleton: (B, D, 1) -> (B, D).
        return state.squeeze(2)

class DCN(nn.Module):
    """Deep & Cross Network (DCN v1), parallel structure.

    The concatenated embeddings and dense features are fed to a cross
    network and an MLP side by side; both outputs are concatenated and
    mapped to a single sigmoid probability.
    """

    def __init__(self, feat_sizes, embedding_size,
                 dense_feature_dim, cross_num_layers=2,
                 dnn_hidden_units=[128, 64], dnn_dropout=0.0):
        """Build the embedding tables, cross network, MLP and output layer.

        Args:
            feat_sizes: dict mapping each sparse feature name to its
                vocabulary size. May be empty (dense-only model).
            embedding_size: embedding width shared by all sparse features.
            dense_feature_dim: total width of the dense feature vector.
            cross_num_layers: number of cross layers.
            dnn_hidden_units: hidden layer widths of the MLP. May be empty,
                in which case the deep branch passes its input through.
            dnn_dropout: dropout probability applied after every hidden layer.
        """
        super().__init__()

        # The default list object is shared between calls; work on a copy so
        # it can never be modified through this instance.
        hidden_units = list(dnn_hidden_units)

        # 1. One embedding table per sparse feature.
        self.embeddings = nn.ModuleDict({
            feat: nn.Embedding(vocab, embedding_size)
            for feat, vocab in feat_sizes.items()
        })

        # Width of x_0 = (number of sparse features * embedding size) + dense width.
        self.input_dim = len(feat_sizes) * embedding_size + dense_feature_dim

        # 2. Cross branch.
        self.cross_net = CrossNetwork(self.input_dim, cross_num_layers)

        # 3. Deep branch: Linear -> ReLU -> Dropout for every hidden width.
        dnn_layers = []
        in_features = self.input_dim
        for out_features in hidden_units:
            dnn_layers.append(nn.Linear(in_features, out_features))
            dnn_layers.append(nn.ReLU())
            dnn_layers.append(nn.Dropout(dnn_dropout))
            in_features = out_features
        self.dnn = nn.Sequential(*dnn_layers)

        # 4. Output layer over [cross_out, deep_out]. After the loop,
        # in_features is the deep branch's output width; with no hidden
        # layers it is still input_dim, matching an empty nn.Sequential,
        # which returns its input unchanged.
        final_dim = self.input_dim + in_features
        self.final_linear = nn.Linear(final_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x_sparse, x_dense):
        """Compute the click probability for a batch.

        Args:
            x_sparse: dict mapping feature name to a LongTensor of shape (B,).
                Must contain every feature name given in ``feat_sizes``.
            x_dense: FloatTensor of shape (B, dense_feature_dim).

        Returns:
            Tensor of shape (B, 1) holding sigmoid outputs.
        """
        # A. Look up embeddings in the model's own feature order, not the
        # caller's dict order, so the layout of x_0 cannot change between
        # calls that merely build the dict in a different key order.
        pieces = [table(x_sparse[feat]) for feat, table in self.embeddings.items()]

        # B. Append the dense features and concatenate -> x_0 of shape
        # (B, input_dim). Concatenating everything in one call also covers
        # a model without sparse features.
        pieces.append(x_dense)
        x_0 = torch.cat(pieces, dim=-1)

        # C. Cross branch and D. deep branch both start from x_0.
        cross_out = self.cross_net(x_0)
        deep_out = self.dnn(x_0)

        # E. Stack both branches and project to one logit per sample.
        stack_out = torch.cat([cross_out, deep_out], dim=-1)
        logits = self.final_linear(stack_out)

        return self.sigmoid(logits)
if __name__ == "__main__":
    # --- Smoke-test configuration ---
    BATCH_SIZE = 8
    EMBEDDING_SIZE = 4
    DENSE_DIM = 5
    # Three sparse features, each mapped to its vocabulary size.
    FEAT_SIZES = {"feat_user": 100, "feat_item": 50, "feat_category": 20}

    # --- Random inputs ---
    # One id vector of shape (BATCH_SIZE,) per sparse feature, drawn from
    # that feature's vocabulary range.
    dummy_sparse = {
        feat: torch.randint(0, vocab, (BATCH_SIZE,))
        for feat, vocab in FEAT_SIZES.items()
    }
    # Dense features of shape (BATCH_SIZE, DENSE_DIM).
    dummy_dense = torch.randn(BATCH_SIZE, DENSE_DIM)

    # --- Model: 3 cross layers next to a 32 -> 16 MLP ---
    model = DCN(
        feat_sizes=FEAT_SIZES,
        embedding_size=EMBEDDING_SIZE,
        dense_feature_dim=DENSE_DIM,
        cross_num_layers=3,
        dnn_hidden_units=[32, 16],
    )

    print(f"Model Structure:\n{model}")

    # --- Forward and backward pass; any failure is reported, not raised ---
    try:
        output = model(dummy_sparse, dummy_dense)
        print("\n=== Forward Pass Successful ===")
        print(f"Input Dense Shape: {dummy_dense.shape}")
        # 3 * 4 + 5 = 17
        print(f"Combined Input Dim: {len(FEAT_SIZES) * EMBEDDING_SIZE + DENSE_DIM}")
        # Expected: (8, 1)
        print(f"Output Shape: {output.shape}")
        print(f"Prediction Examples: \n{output.detach().view(-1)}")

        # Backpropagate a scalar to check that gradients can be computed.
        output.mean().backward()
        print("Backward Pass Successful (Gradient Computed).")

    except Exception as e:
        print(f"Error: {e}")

DCNv2

https://arxiv.org/abs/2008.13535

主要改动：

将DCNv1的权重向量替换为了权重矩阵，提高模型Cross侧能力
将权重矩阵用低秩矩阵分解做近似，减轻计算量
为了弥补低秩矩阵近似带来的表达能力损失，DCNv2采用Gated MoE结构（就是MMoE中的专家层替换为CrossNetwork）
提供两种Deep+Cross的方式：串行/并行

DCNv2 实现（全矩阵 Cross 层 + 并行结构；以下代码未包含低秩分解和 Gated MoE）：

import torch
import torch.nn as nn
import torch.nn.functional as F

class CrossNetV2(nn.Module):
    r"""Cross network of DCN-V2 with a full matrix kernel per layer.

    Every layer applies ``x_{l+1} = x_0 \odot (W_l x_l + b_l) + x_l`` where
    ``W_l`` has shape ``(input_dim, input_dim)`` and ``b_l`` has shape
    ``(input_dim, 1)``, so the feature width never changes.
    """

    def __init__(self, input_dim, num_layers):
        """Create ``num_layers`` cross layers over ``input_dim`` features.

        Kernels are Xavier-normal initialised; biases start at zero.
        """
        super().__init__()
        self.num_layers = num_layers

        # One full (input_dim, input_dim) matrix per layer (no low-rank
        # factorisation), followed by one (input_dim, 1) bias per layer.
        self.kernels = nn.ParameterList(
            [nn.Parameter(torch.nn.init.xavier_normal_(torch.empty(input_dim, input_dim)))
             for _ in range(num_layers)]
        )
        self.biases = nn.ParameterList(
            [nn.Parameter(torch.nn.init.zeros_(torch.empty(input_dim, 1)))
             for _ in range(num_layers)]
        )

    def forward(self, x_0):
        """Run the input through all cross layers.

        Args:
            x_0: tensor of shape (batch_size, input_dim).

        Returns:
            Tensor of shape (batch_size, input_dim).
        """
        # Work with column vectors: (B, D) -> (B, D, 1).
        x_0 = x_0.unsqueeze(2)
        x_l = x_0

        for i in range(self.num_layers):
            # 1. W_l x_l: (D, D) @ (B, D, 1) -> (B, D, 1); matmul broadcasts
            #    the 2-D kernel over the batch dimension.
            linear_trans = torch.matmul(self.kernels[i], x_l)

            # 2. Add the bias b_l (broadcast over the batch).
            linear_trans = linear_trans + self.biases[i]

            # 3. Feature crossing: element-wise (Hadamard) product with x_0.
            interaction = x_0 * linear_trans

            # 4. Residual connection.
            x_l = interaction + x_l

        # Drop the trailing singleton: (B, D, 1) -> (B, D).
        return x_l.squeeze(2)

class DCNv2(nn.Module):
    """DCN V2 model, parallel structure.

    Layout: Embedding -> (CrossNetV2 || DeepNet) -> Concat -> Output.
    """

    def __init__(self, feat_sizes, embedding_size,
                 dense_feature_dim, cross_layers=2,
                 dnn_hidden_units=[128, 64], dnn_dropout=0.0):
        """Build the embedding tables, cross network, MLP and output layer.

        Args:
            feat_sizes: dict mapping each sparse feature name to its
                vocabulary size. May be empty (dense-only model).
            embedding_size: embedding width shared by all sparse features.
            dense_feature_dim: total width of the dense feature vector.
            cross_layers: number of cross layers.
            dnn_hidden_units: hidden layer widths of the MLP. May be empty,
                in which case the deep branch passes its input through.
            dnn_dropout: dropout probability applied after every hidden layer.
        """
        super().__init__()

        # The default list object is shared between calls; work on a copy so
        # it can never be modified through this instance.
        hidden_units = list(dnn_hidden_units)

        # 1. One embedding table per sparse feature.
        self.embeddings = nn.ModuleDict({
            feat: nn.Embedding(vocab, embedding_size)
            for feat, vocab in feat_sizes.items()
        })

        # Width of x_0 = (number of sparse features * embedding size) + dense width.
        self.input_dim = len(feat_sizes) * embedding_size + dense_feature_dim

        # 2. Cross branch (matrix-kernel cross layers).
        self.cross_net = CrossNetV2(self.input_dim, cross_layers)

        # 3. Deep branch: Linear -> ReLU -> Dropout for every hidden width.
        dnn_layers = []
        in_features = self.input_dim
        for out_features in hidden_units:
            dnn_layers.append(nn.Linear(in_features, out_features))
            dnn_layers.append(nn.ReLU())
            dnn_layers.append(nn.Dropout(dnn_dropout))
            in_features = out_features
        self.dnn = nn.Sequential(*dnn_layers)

        # 4. Output layer over [cross_out, deep_out]. The cross branch keeps
        # the input width. After the loop, in_features is the deep branch's
        # output width; with no hidden layers it is still input_dim, matching
        # an empty nn.Sequential, which returns its input unchanged.
        final_dim = self.input_dim + in_features
        self.final_linear = nn.Linear(final_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x_sparse, x_dense):
        """Compute the click probability for a batch.

        Args:
            x_sparse: dict mapping feature name to an index tensor accepted
                by ``nn.Embedding``. Must contain every feature name given
                in ``feat_sizes``.
            x_dense: dense feature tensor whose last dimension is
                ``dense_feature_dim``.

        Returns:
            Tensor with a trailing dimension of 1 holding sigmoid outputs.
        """
        # A. Look up embeddings in the model's own feature order, not the
        # caller's dict order, so the layout of x_0 cannot change between
        # calls that merely build the dict in a different key order.
        pieces = [table(x_sparse[feat]) for feat, table in self.embeddings.items()]

        # Append the dense features and concatenate in one call; with no
        # sparse features x_0 is just the dense input.
        pieces.append(x_dense)
        x_0 = torch.cat(pieces, dim=-1)

        # B. Cross branch and C. deep branch both start from x_0.
        cross_out = self.cross_net(x_0)
        deep_out = self.dnn(x_0)

        # D. Parallel structure: stack both branches, project to one logit.
        stack_out = torch.cat([cross_out, deep_out], dim=-1)
        logits = self.final_linear(stack_out)

        return self.sigmoid(logits)
if __name__ == "__main__":
    print("=== DCN V2 测试 ===")

    # --- Smoke-test configuration ---
    BATCH_SIZE = 4
    EMB_DIM = 8
    DENSE_DIM = 5
    # Three sparse features, each mapped to its vocabulary size.
    FEAT_SIZES = {"user_id": 100, "item_id": 50, "cate_id": 20}

    # --- Random inputs ---
    # One id vector of shape (BATCH_SIZE,) per sparse feature, drawn from
    # that feature's vocabulary range.
    dummy_sparse = {
        feat: torch.randint(0, vocab, (BATCH_SIZE,))
        for feat, vocab in FEAT_SIZES.items()
    }
    dummy_dense = torch.randn(BATCH_SIZE, DENSE_DIM)

    # --- Model: 3 cross layers next to a 64 -> 32 MLP ---
    model = DCNv2(
        feat_sizes=FEAT_SIZES,
        embedding_size=EMB_DIM,
        dense_feature_dim=DENSE_DIM,
        cross_layers=3,
        dnn_hidden_units=[64, 32],
    )

    # 3 * 8 + 5 = 29
    print(f"模型总输入维度: {3 * EMB_DIM + DENSE_DIM}")

    # --- Forward and backward pass; any failure is reported, not raised ---
    try:
        output = model(dummy_sparse, dummy_dense)
        # Expected: (4, 1)
        print(f"Output Shape: {output.shape}")
        print(f"Predictions: {output.detach().view(-1)}")

        # Backpropagate a scalar so the parameters receive gradients.
        output.mean().backward()

        # Inspect the gradient of the first cross layer's kernel.
        grad = model.cross_net.kernels[0].grad
        if grad is None:
            print("反向传播失败：无梯度")
        else:
            print(f"CrossNet Layer 0 Gradient Norm: {grad.norm().item():.4f}")
            print("反向传播成功！")

    except Exception as e:
        print(f"运行出错: {e}")