我正在研究一个需要输入 6 张图像的模型(这 6 张图像实际上是取自同一段视频的连续帧)。我还有一个 CNN 在解决类似的问题,但这两个模型都存在同样的毛病。
两个模型都出现了垂死 ReLU(dying ReLU)问题,这通常换成 LeakyReLU 就能轻松修复。但 Vision Transformer 对我来说还比较陌生、结构也更复杂,我需要在其中实现 LeakyReLU。希望有人能指点我应该在 Vision Transformer 模型的哪个位置、以及如何加入 LeakyReLU。
def make_ViT_6():
    """Build a 6-stream Vision Transformer for sequences of 6 video frames.

    Each of the six 370x370 grayscale inputs is split into patches,
    positionally encoded, and passed through its own stack of
    ``transformer_layers`` pre-norm Transformer encoder blocks.  The six
    per-stream feature vectors are concatenated and fed to a small dense
    fusion head ending in a 6-unit linear output.

    Relies on module-level globals: ``patch_size``, ``num_patches``,
    ``projection_dim``, ``transformer_layers``, ``num_heads``,
    ``transformer_units``, ``mlp_head_units``, the ``Patches`` and
    ``PatchEncoder`` custom layers, and the ``mlp`` helper.

    Returns:
        tf.keras.Model mapping [input_0, ..., input_5] -> shape-(batch, 6)
        linear output.
    """
    n_streams = 6
    inputs = [
        tf.keras.layers.Input(shape=(370, 370, 1)) for _ in range(n_streams)
    ]

    def encode_stream(inp):
        """Run one input frame through the full per-stream ViT pipeline."""
        # Patchify + add learned positional/projection encoding.
        x = Patches(patch_size)(inp)
        x = PatchEncoder(num_patches, projection_dim)(x)
        # Pre-norm Transformer encoder blocks (norm -> attention -> residual,
        # norm -> MLP -> residual), fresh weights per block and per stream,
        # matching the original unrolled code.
        for _ in range(transformer_layers):
            x1 = layers.LayerNormalization(epsilon=1e-6)(x)
            attention_output = layers.MultiHeadAttention(
                num_heads=num_heads, key_dim=projection_dim, dropout=0.1
            )(x1, x1)
            x2 = layers.Add()([attention_output, x])  # residual connection 1
            x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
            # NOTE(review): if mlp() uses ReLU activations internally, the
            # dying-ReLU fix belongs there too — switch those to
            # layers.LeakyReLU() as well.
            x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1)
            x = layers.Add()([x3, x2])  # residual connection 2
        # Per-stream representation head.
        representation = layers.LayerNormalization(epsilon=1e-6)(x)
        representation = layers.Flatten()(representation)
        representation = layers.Dropout(0.5)(representation)
        return mlp(representation, hidden_units=mlp_head_units, dropout_rate=0.5)

    # Concatenate the six feature vectors directly; the intermediate
    # single-stream Model wrappers in the original added nothing to the graph.
    features = [encode_stream(inp) for inp in inputs]
    merged = tf.keras.layers.concatenate(features)

    # Dying-ReLU fix: Dense(32) followed by a LeakyReLU layer is the drop-in
    # replacement for Dense(32, activation='relu') — the small negative slope
    # keeps gradients flowing for negative pre-activations.
    head = tf.keras.layers.Dense(32)(merged)
    head = tf.keras.layers.LeakyReLU(alpha=0.1)(head)
    final_layer = tf.keras.layers.Dense(6)(head)

    return tf.keras.Model(inputs=inputs, outputs=final_layer)