DCN

基本信息

DCN 系列分两版,以下信息基于 DCNv1(原始论文);DCNv2 见正文 ## DCNv2 节。

字段 内容
标题 Deep & Cross Network for Ad Click Predictions
作者 Ruoxi Wang, Bin Fu, Gang Fu, Mingliang Wang
机构 Stanford University; Google
年份 2017 (AdKDD’17)
方向 Explicit Feature Crossing, Cross Network, Wide-Deep Combination
场景 广告 CTR 预估中的显式 + 隐式特征交叉
arXiv DCNv1 https://arxiv.org/abs/1708.05123 · DCNv2 https://arxiv.org/abs/2008.13535

显式 + 自动 的 特征交叉

DNN:隐式特征交叉

LR:大量依赖人工特征工程

DCN = Deep Network + Cross Network

DCN先后推出了DCNv1 和 DCNv2两个版本

DCNv1

https://arxiv.org/abs/1708.05123

img

img

特征交叉发生在 $x_0$ 和 $x_l^\top$ 的矩阵乘法(即外积 $x_0 x_l^\top$)上

参数量:$d \times L \times 2$(共 $L$ 层,每层一个 $d$ 维权重向量 $w_l$ 和一个 $d$ 维偏置 $b_l$)

计算成本:利用矩阵乘法的结合律,先计算标量 $x_l^\top w_l$ 再与 $x_0$ 相乘,可以从 $O(d^2)$ 降到 $O(d)$

缺点:

  1. 权重向量参数量小(每层仅 $d$ 维),模型能力有限
  2. 受限于x0的特定形式,每层的x是x0的标量倍,表达能力受限
  3. 是拼接后的bit-wise交互,可能会破坏每个feature原始的含义,相比vector-wise可能效果要差一些
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import torch
import torch.nn as nn
import torch.nn.functional as F

class CrossNetwork(nn.Module):
    """Cross network of DCN (v1): a stack of vector-kernel cross layers.

    Each layer computes ``x_{l+1} = x_0 * (x_l^T w_l) + b_l + x_l`` where
    ``w_l`` and ``b_l`` are learnable vectors stored with shape
    ``(input_dim, 1)``.

    Args:
        input_dim: Size of the feature (last) dimension of the input.
        num_layers: Number of cross layers. With ``0`` layers the module
            returns its input unchanged.
    """

    def __init__(self, input_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers

        # One weight vector w_l and one bias vector b_l per layer, both kept
        # as (input_dim, 1) so existing checkpoints stay loadable.
        self.kernels = nn.ParameterList(
            [
                nn.Parameter(
                    torch.nn.init.xavier_normal_(torch.empty(input_dim, 1))
                )
                for _ in range(num_layers)
            ]
        )
        self.biases = nn.ParameterList(
            [nn.Parameter(torch.zeros(input_dim, 1)) for _ in range(num_layers)]
        )

    def forward(self, x_0):
        """Apply all cross layers to ``x_0``.

        Args:
            x_0: Tensor of shape ``(..., input_dim)``. The usual case is
                ``(batch_size, input_dim)``; any number of leading
                dimensions is accepted.

        Returns:
            Tensor with the same shape as ``x_0``.
        """
        x_l = x_0
        for kernel, bias in zip(self.kernels, self.biases):
            # Per-sample scalar x_l^T w_l: (..., D) @ (D, 1) -> (..., 1).
            xl_w = torch.matmul(x_l, kernel)
            # Scale x_0 by that scalar (broadcast over the feature dim),
            # then add the bias and the residual connection.
            x_l = x_0 * xl_w + bias.squeeze(-1) + x_l
        return x_l

class DCN(nn.Module):
    """Deep & Cross Network (parallel cross branch + deep branch).

    Sparse features are embedded, concatenated with the dense features into
    ``x_0``, and fed to a ``CrossNetwork`` and an MLP in parallel. The two
    branch outputs are concatenated and mapped to a single sigmoid output.

    Args:
        feat_sizes: dict ``{feature_name: vocabulary_size}`` for the sparse
            features. May be empty.
        embedding_size: Embedding dimension used for every sparse feature.
        dense_feature_dim: Total width of the dense feature vector.
        cross_num_layers: Number of cross layers.
        dnn_hidden_units: Hidden layer widths of the deep branch. May be
            empty, in which case the deep branch passes ``x_0`` through.
        dnn_dropout: Dropout probability applied after every hidden layer.
    """

    def __init__(self, feat_sizes, embedding_size,
                 dense_feature_dim, cross_num_layers=2,
                 dnn_hidden_units=(128, 64), dnn_dropout=0.0):
        super().__init__()

        # 1. One embedding table per sparse feature.
        self.embeddings = nn.ModuleDict({
            feat: nn.Embedding(vocab, embedding_size)
            for feat, vocab in feat_sizes.items()
        })

        # Width of x_0 = (num sparse feats * emb size) + dense feature dim.
        self.input_dim = len(feat_sizes) * embedding_size + dense_feature_dim

        # 2. Cross branch.
        self.cross_net = CrossNetwork(self.input_dim, cross_num_layers)

        # 3. Deep branch (MLP): Linear -> ReLU -> Dropout per hidden layer.
        dnn_layers = []
        input_act = self.input_dim
        for hidden_unit in dnn_hidden_units:
            dnn_layers.append(nn.Linear(input_act, hidden_unit))
            dnn_layers.append(nn.ReLU())
            dnn_layers.append(nn.Dropout(dnn_dropout))
            input_act = hidden_unit
        self.dnn = nn.Sequential(*dnn_layers)

        # 4. Combination layer over [cross_out, deep_out]. After the loop,
        # input_act is the deep branch output width; it equals input_dim
        # when there are no hidden layers (empty Sequential is an identity).
        final_dim = self.input_dim + input_act
        self.final_linear = nn.Linear(final_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x_sparse, x_dense):
        """Compute the sigmoid output for a batch.

        Args:
            x_sparse: dict mapping feature name to a ``(B,)`` LongTensor of
                ids. It must contain every feature in ``feat_sizes``; extra
                keys are ignored.
            x_dense: ``(B, dense_feature_dim)`` FloatTensor.

        Returns:
            ``(B, 1)`` tensor of sigmoid outputs.

        Raises:
            KeyError: If a feature declared in ``feat_sizes`` is missing
                from ``x_sparse``.
        """
        # A. Embed sparse features in the model's own (construction-time)
        # order, so the layout of x_0 does not depend on the caller's dict
        # ordering.
        parts = [
            embedding(x_sparse[feat])
            for feat, embedding in self.embeddings.items()
        ]

        # B. Append dense features -> x_0 of shape (B, input_dim). This also
        # works when there are no sparse features at all.
        parts.append(x_dense)
        x_0 = torch.cat(parts, dim=-1)

        # C. Cross branch.
        cross_out = self.cross_net(x_0)

        # D. Deep branch.
        deep_out = self.dnn(x_0)

        # E. Stack both branches and project to one logit.
        stack_out = torch.cat([cross_out, deep_out], dim=-1)
        logits = self.final_linear(stack_out)

        return self.sigmoid(logits)
if __name__ == "__main__":
    # Smoke test: build a small DCN, run one forward and one backward pass.

    # === 1. Hyper-parameters ===
    BATCH_SIZE = 8
    EMBEDDING_SIZE = 4
    DENSE_DIM = 5
    # Three sparse features and their vocabulary sizes.
    FEAT_SIZES = {"feat_user": 100, "feat_item": 50, "feat_category": 20}

    # === 2. Dummy data ===
    # Sparse ids of shape (BATCH_SIZE,), drawn below each feature's own
    # vocabulary size so the bounds cannot drift from FEAT_SIZES.
    dummy_sparse = {
        feat: torch.randint(0, vocab, (BATCH_SIZE,))
        for feat, vocab in FEAT_SIZES.items()
    }
    # Dense features of shape (BATCH_SIZE, DENSE_DIM).
    dummy_dense = torch.randn(BATCH_SIZE, DENSE_DIM)

    # === 3. Build the model ===
    model = DCN(
        feat_sizes=FEAT_SIZES,
        embedding_size=EMBEDDING_SIZE,
        dense_feature_dim=DENSE_DIM,
        cross_num_layers=3,         # three cross layers
        dnn_hidden_units=[32, 16],  # deep branch layout
    )

    print(f"Model Structure:\n{model}")

    # === 4. Forward / backward check ===
    # Errors are reported as a message rather than a traceback, as in the
    # original demo.
    try:
        output = model(dummy_sparse, dummy_dense)
        print("\n=== Forward Pass Successful ===")
        print(f"Input Dense Shape: {dummy_dense.shape}")
        print(f"Combined Input Dim: {len(FEAT_SIZES) * EMBEDDING_SIZE + DENSE_DIM}")  # 3*4 + 5 = 17
        print(f"Output Shape: {output.shape}")  # expected (8, 1)
        print(f"Prediction Examples: \n{output.detach().view(-1)}")

        # Minimal backward pass to confirm gradients flow.
        loss = output.mean()
        loss.backward()
        print("Backward Pass Successful (Gradient Computed).")

    except Exception as e:
        print(f"Error: {e}")

DCNv2

https://arxiv.org/abs/2008.13535

主要改动:

  1. 将DCNv1的权重向量替换为了权重矩阵,提高模型Cross侧能力
  2. 将权重矩阵用低秩矩阵分解做近似,减轻计算量
  3. 为了弥补低秩矩阵近似带来的表达能力损失,DCNv2采用Gated MoE结构(就是MMoE中的专家层替换为CrossNetwork)
  4. 提供两种Deep+Cross的方式:串行/并行

Gated MoE实现:

img

img

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import torch
import torch.nn as nn
import torch.nn.functional as F

class CrossNetV2(nn.Module):
    """Cross network of DCN V2 with a full matrix kernel per layer.

    Each layer computes ``x_{l+1} = x_0 ⊙ (W_l x_l + b_l) + x_l`` where
    ``W_l`` is a learnable ``(input_dim, input_dim)`` matrix and ``b_l`` a
    learnable bias stored with shape ``(input_dim, 1)``.

    Args:
        input_dim: Size of the feature (last) dimension of the input.
        num_layers: Number of cross layers. With ``0`` layers the module
            returns its input unchanged.
    """

    def __init__(self, input_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers

        # One full (input_dim, input_dim) kernel and one (input_dim, 1) bias
        # per layer; shapes are kept so existing checkpoints stay loadable.
        self.kernels = nn.ParameterList(
            [
                nn.Parameter(
                    torch.nn.init.xavier_normal_(
                        torch.empty(input_dim, input_dim)
                    )
                )
                for _ in range(num_layers)
            ]
        )
        self.biases = nn.ParameterList(
            [nn.Parameter(torch.zeros(input_dim, 1)) for _ in range(num_layers)]
        )

    def forward(self, x_0):
        """Apply all cross layers to ``x_0``.

        Args:
            x_0: Tensor of shape ``(..., input_dim)``. The usual case is
                ``(batch_size, input_dim)``; any number of leading
                dimensions is accepted.

        Returns:
            Tensor with the same shape as ``x_0``.
        """
        x_l = x_0
        for kernel, bias in zip(self.kernels, self.biases):
            # F.linear computes x_l @ W^T + b over the last dim, which is
            # W_l x_l + b_l for every sample.
            linear_trans = F.linear(x_l, kernel, bias.squeeze(-1))
            # Hadamard product with x_0, plus the residual connection.
            x_l = x_0 * linear_trans + x_l
        return x_l

class DCNv2(nn.Module):
    """DCN V2 model, parallel structure.

    Layout: Embedding -> (CrossNetV2 || deep MLP) -> concat -> linear ->
    sigmoid.

    Args:
        feat_sizes: dict ``{feature_name: vocabulary_size}`` for the sparse
            features. May be empty.
        embedding_size: Embedding dimension used for every sparse feature.
        dense_feature_dim: Total width of the dense feature vector.
        cross_layers: Number of cross layers.
        dnn_hidden_units: Hidden layer widths of the deep branch. May be
            empty, in which case the deep branch passes ``x_0`` through.
        dnn_dropout: Dropout probability applied after every hidden layer.
    """

    def __init__(self, feat_sizes, embedding_size,
                 dense_feature_dim, cross_layers=2,
                 dnn_hidden_units=(128, 64), dnn_dropout=0.0):
        super().__init__()

        # 1. One embedding table per sparse feature.
        self.embeddings = nn.ModuleDict({
            feat: nn.Embedding(vocab, embedding_size)
            for feat, vocab in feat_sizes.items()
        })

        # Width of x_0 = (num sparse feats * emb size) + dense feature dim.
        self.input_dim = len(feat_sizes) * embedding_size + dense_feature_dim

        # 2. Cross branch (matrix kernel).
        self.cross_net = CrossNetV2(self.input_dim, cross_layers)

        # 3. Deep branch (MLP): Linear -> ReLU -> Dropout per hidden layer.
        dnn_layers = []
        input_act = self.input_dim
        for hidden_unit in dnn_hidden_units:
            dnn_layers.append(nn.Linear(input_act, hidden_unit))
            dnn_layers.append(nn.ReLU())
            dnn_layers.append(nn.Dropout(dnn_dropout))
            input_act = hidden_unit
        self.dnn = nn.Sequential(*dnn_layers)

        # 4. Combination layer over [cross_out, deep_out]. The cross branch
        # keeps the input width; input_act is the deep branch output width
        # (input_dim when there are no hidden layers).
        final_dim = self.input_dim + input_act
        self.final_linear = nn.Linear(final_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x_sparse, x_dense):
        """Compute the sigmoid output for a batch.

        Args:
            x_sparse: dict mapping feature name to a tensor of ids, one entry
                per feature in ``feat_sizes``; extra keys are ignored.
                Presumably each value is a ``(B,)`` LongTensor, as in the
                demo script — confirm against callers.
            x_dense: Dense feature tensor, expected to be
                ``(B, dense_feature_dim)``.

        Returns:
            Tensor of sigmoid outputs with a trailing dimension of 1.

        Raises:
            KeyError: If a feature declared in ``feat_sizes`` is missing
                from ``x_sparse``.
        """
        # A. Embed sparse features in the model's own (construction-time)
        # order, so the layout of x_0 does not depend on the caller's dict
        # ordering. With no sparse features, x_0 is just the dense input.
        parts = [
            embedding(x_sparse[feat])
            for feat, embedding in self.embeddings.items()
        ]
        if parts:
            parts.append(x_dense)
            x_0 = torch.cat(parts, dim=-1)
        else:
            x_0 = x_dense

        # B. Cross branch.
        cross_out = self.cross_net(x_0)

        # C. Deep branch.
        deep_out = self.dnn(x_0)

        # D. Concatenate both branches (parallel structure) and project.
        stack_out = torch.cat([cross_out, deep_out], dim=-1)
        logits = self.final_linear(stack_out)

        return self.sigmoid(logits)
if __name__ == "__main__":
    # Smoke test: build a small DCNv2, run one forward and one backward pass.
    print("=== DCN V2 测试 ===")

    # 1. Hyper-parameters.
    BATCH_SIZE = 4
    EMB_DIM = 8
    DENSE_DIM = 5
    # Three sparse features and their vocabulary sizes.
    FEAT_SIZES = {"user_id": 100, "item_id": 50, "cate_id": 20}

    # 2. Dummy data. Ids are drawn below each feature's own vocabulary size
    # so the bounds cannot drift from FEAT_SIZES.
    dummy_sparse = {
        feat: torch.randint(0, vocab, (BATCH_SIZE,))
        for feat, vocab in FEAT_SIZES.items()
    }
    dummy_dense = torch.randn(BATCH_SIZE, DENSE_DIM)

    # 3. Build the model.
    model = DCNv2(
        feat_sizes=FEAT_SIZES,
        embedding_size=EMB_DIM,
        dense_feature_dim=DENSE_DIM,
        cross_layers=3,  # three cross layers
        dnn_hidden_units=[64, 32],
    )

    # Feature count comes from FEAT_SIZES instead of a hard-coded 3.
    print(f"模型总输入维度: {len(FEAT_SIZES) * EMB_DIM + DENSE_DIM}")  # 3*8 + 5 = 29

    # 4. Forward pass. Errors are reported as a message rather than a
    # traceback, as in the original demo.
    try:
        output = model(dummy_sparse, dummy_dense)
        print(f"Output Shape: {output.shape}")  # expected (4, 1)
        print(f"Predictions: {output.detach().view(-1)}")

        # 5. Backward pass check.
        loss = output.mean()
        loss.backward()

        # Inspect the gradient of the first cross layer's kernel.
        grad = model.cross_net.kernels[0].grad
        if grad is not None:
            print(f"CrossNet Layer 0 Gradient Norm: {grad.norm().item():.4f}")
            print("反向传播成功!")
        else:
            print("反向传播失败:无梯度")

    except Exception as e:
        print(f"运行出错: {e}")