import torch
from torch import nn

# in_features -> F and out_feature -> F'
in_features = ...
out_feature = ...

# instantiate the learnable weight matrix W (F x F')
W = nn.Parameter(torch.empty(size=(in_features, out_feature)))

# initialize the weight matrix W
nn.init.xavier_normal_(W)

# multiply h and W (h is the input feature matrix of all the nodes -> N x F)
h_transformed = torch.mm(h, W)
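To make the shapes concrete, here is a quick sanity check of the transformation above; the node count and feature dimensions below are made up purely for illustration.

# quick shape check with arbitrary toy sizes: N = 4 nodes, F = 3, F' = 2
N, in_features, out_feature = 4, 3, 2
h = torch.rand(N, in_features)                                   # N x F node feature matrix
W = nn.Parameter(torch.empty(size=(in_features, out_feature)))
nn.init.xavier_normal_(W)
h_transformed = torch.mm(h, W)
print(h_transformed.shape)                                       # torch.Size([4, 2]) -> N x F'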
# instantiate the learnable attention parameter vector `a`
a = nn.Parameter(torch.empty(size=(2 * out_feature, 1)))

# initialize the parameter vector `a`
nn.init.xavier_normal_(a)

# LeakyReLU with a negative slope of 0.2, as in the paper
leakyrelu = nn.LeakyReLU(0.2)

# we obtained `h_transformed` in the previous code snippet

# calculating the dot product of all node embeddings
# with the first half of the attention vector parameters (corresponding to neighbor messages)
source_scores = torch.matmul(h_transformed, a[:out_feature, :])

# calculating the dot product of all node embeddings
# with the second half of the attention vector parameters (corresponding to the target node)
target_scores = torch.matmul(h_transformed, a[out_feature:, :])

# broadcast add: (N x 1) + (1 x N) -> N x N
e = source_scores + target_scores.T
e = leakyrelu(e)
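As a sanity check on the broadcasting trick (continuing the toy tensors from the sketch above), the score e[i, j] computed this way matches what you get by explicitly concatenating the two transformed node embeddings and multiplying by `a`:

# check (toy sizes): broadcasting reproduces LeakyReLU(a^T [W h_i || W h_j]) for a pair (i, j)
i, j = 0, 1
e_ij_explicit = leakyrelu(torch.cat([h_transformed[i], h_transformed[j]]) @ a.squeeze())
print(torch.allclose(e[i, j], e_ij_explicit))                    # True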
import torch.nn.functional as F

connectivity_mask = -9e16 * torch.ones_like(e)
# adj_mat is the N x N adjacency matrix
e = torch.where(adj_mat > 0, e, connectivity_mask)  # masked attention scores

# attention coefficients are computed as a softmax over each row of e
# (i.e. over the scores of the neighbors j of each node i)
attention = F.softmax(e, dim=-1)
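To see what the masking does, here is a tiny made-up example (both the score matrix and the adjacency matrix below are arbitrary): scores of non-neighbors are pushed to a huge negative value, so after the softmax their attention weights are effectively zero while each row still sums to 1.

# tiny made-up example of masking + row-wise softmax (values are arbitrary)
toy_e = torch.tensor([[ 0.5, 1.0, -0.3],
                      [ 1.2, 0.0,  0.7],
                      [-0.1, 0.4,  0.9]])
toy_adj = torch.tensor([[1, 1, 0],
                        [1, 1, 1],
                        [0, 1, 1]])
toy_masked = torch.where(toy_adj > 0, toy_e, -9e16 * torch.ones_like(toy_e))
toy_attention = F.softmax(toy_masked, dim=-1)
print(toy_attention)           # zeros wherever toy_adj == 0
print(toy_attention.sum(-1))   # each row sums to 1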
import torch
from torch import nn
import torch.nn.functional as F

################################
###  GAT LAYER DEFINITION    ###
################################

class GraphAttentionLayer(nn.Module):

    def __init__(self, in_features: int, out_features: int, n_heads: int,
                 concat: bool = False, dropout: float = 0.4,
                 leaky_relu_slope: float = 0.2):
        super(GraphAttentionLayer, self).__init__()

        self.n_heads = n_heads    # number of attention heads
        self.concat = concat      # whether to concatenate the final attention heads
        self.dropout = dropout    # dropout rate

        if concat:  # concatenating the attention heads
            self.out_features = out_features    # number of output features per node
            assert out_features % n_heads == 0  # ensure that out_features is a multiple of n_heads
            self.n_hidden = out_features // n_heads
        else:       # averaging the output over the attention heads (used in the main paper)
            self.n_hidden = out_features

        # A shared linear transformation, parametrized by a weight matrix W, is applied to every node
        # initialize the weight matrix W
        self.W = nn.Parameter(torch.empty(size=(in_features, self.n_hidden * n_heads)))

        # initialize the attention weights a (one vector of size 2 * n_hidden per head)
        self.a = nn.Parameter(torch.empty(size=(n_heads, 2 * self.n_hidden, 1)))

        self.leakyrelu = nn.LeakyReLU(leaky_relu_slope)  # LeakyReLU activation function
        self.softmax = nn.Softmax(dim=1)                 # softmax applied to the attention coefficients

        self.reset_parameters()  # initialize the parameters

    def reset_parameters(self):
        nn.init.xavier_normal_(self.W)
        nn.init.xavier_normal_(self.a)

    def _get_attention_scores(self, h_transformed: torch.Tensor):
        source_scores = torch.matmul(h_transformed, self.a[:, :self.n_hidden, :])
        target_scores = torch.matmul(h_transformed, self.a[:, self.n_hidden:, :])

        # broadcast add
        # (n_heads, n_nodes, 1) + (n_heads, 1, n_nodes) = (n_heads, n_nodes, n_nodes)
        e = source_scores + target_scores.mT
        return self.leakyrelu(e)

    def forward(self, h: torch.Tensor, adj_mat: torch.Tensor):
        n_nodes = h.shape[0]

        # apply the linear transformation to the node features -> W h
        # output shape (n_nodes, n_hidden * n_heads)
        h_transformed = torch.mm(h, self.W)
        h_transformed = F.dropout(h_transformed, self.dropout, training=self.training)

        # split the heads by reshaping the tensor and putting the heads dim first
        # output shape (n_heads, n_nodes, n_hidden)
        h_transformed = h_transformed.view(n_nodes, self.n_heads, self.n_hidden).permute(1, 0, 2)

        # getting the attention scores
        # output shape (n_heads, n_nodes, n_nodes)
        e = self._get_attention_scores(h_transformed)

        # set the attention score of non-existent edges to -9e16 (masking non-existent edges)
        connectivity_mask = -9e16 * torch.ones_like(e)
        e = torch.where(adj_mat > 0, e, connectivity_mask)  # masked attention scores

        # attention coefficients are computed as a softmax over each row of e
        # (i.e. over the scores of the neighbors j of each node i)
        attention = F.softmax(e, dim=-1)
        attention = F.dropout(attention, self.dropout, training=self.training)

        # final node embeddings are computed as a weighted average of the features of the neighbors
        h_prime = torch.matmul(attention, h_transformed)

        # concatenating/averaging the attention heads
        # output shape (n_nodes, out_features)
        if self.concat:
            h_prime = h_prime.permute(1, 0, 2).contiguous().view(n_nodes, self.out_features)
        else:
            h_prime = h_prime.mean(dim=0)

        return h_prime
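To check that the layer wires up correctly, here is a minimal usage sketch; the random features, the random symmetric adjacency matrix, and all of the sizes below are arbitrary and chosen only for illustration.

# minimal usage sketch of GraphAttentionLayer with random data (arbitrary sizes)
n_nodes, in_features, out_features, n_heads = 5, 8, 6, 3
h = torch.rand(n_nodes, in_features)
adj_mat = torch.randint(0, 2, (n_nodes, n_nodes))
adj_mat = ((adj_mat + adj_mat.T + torch.eye(n_nodes, dtype=torch.long)) > 0).long()  # symmetric, with self-loops

layer = GraphAttentionLayer(in_features=in_features, out_features=out_features,
                            n_heads=n_heads, concat=True)
layer.eval()      # disable dropout for a deterministic forward pass
out = layer(h, adj_mat)
print(out.shape)  # torch.Size([5, 6]) -> (n_nodes, out_features)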
Finally, we put all of the code above together into a complete GAT model:
class GAT(nn.Module):

    def __init__(self, in_features, n_hidden, n_heads, num_classes,
                 concat=False, dropout=0.4, leaky_relu_slope=0.2):
        super(GAT, self).__init__()

        # Define the Graph Attention layers
        self.gat1 = GraphAttentionLayer(
            in_features=in_features, out_features=n_hidden, n_heads=n_heads,
            concat=concat, dropout=dropout, leaky_relu_slope=leaky_relu_slope
        )

        self.gat2 = GraphAttentionLayer(
            in_features=n_hidden, out_features=num_classes, n_heads=1,
            concat=False, dropout=dropout, leaky_relu_slope=leaky_relu_slope
        )

    def forward(self, input_tensor: torch.Tensor, adj_mat: torch.Tensor):
        # Apply the first Graph Attention layer
        x = self.gat1(input_tensor, adj_mat)
        x = F.elu(x)  # apply the ELU activation function to the output of the first layer

        # Apply the second Graph Attention layer
        x = self.gat2(x, adj_mat)

        return F.softmax(x, dim=1)  # apply the softmax activation function
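And a short forward-pass sketch of the full model, again on random inputs with arbitrary sizes; in a real experiment the features, adjacency matrix and labels would come from a node-classification dataset such as Cora.

# forward-pass sketch of the full GAT model on random inputs (arbitrary sizes)
n_nodes, in_features, n_hidden, n_heads, num_classes = 10, 16, 8, 4, 3
features = torch.rand(n_nodes, in_features)
adj_mat = torch.randint(0, 2, (n_nodes, n_nodes))
adj_mat = ((adj_mat + adj_mat.T + torch.eye(n_nodes, dtype=torch.long)) > 0).long()

model = GAT(in_features=in_features, n_hidden=n_hidden, n_heads=n_heads,
            num_classes=num_classes)
model.eval()           # disable dropout for a deterministic pass
out = model(features, adj_mat)
print(out.shape)       # torch.Size([10, 3]) -> class probabilities per node
print(out.sum(dim=1))  # each row sums to 1 because of the final softmax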