標籤傳播演算法——llgc
動手實踐標籤傳播演算法
初始化演算法
載入一些必備的庫:
# Notebook prelude: imports and the output directory for saved figures.
from IPython.display import set_matplotlib_formats
%matplotlib inline
#set_matplotlib_formats('svg', 'pdf')
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from sklearn.datasets import make_moons
save_dir = '../data/images'  # directory where figures are written
建立一個簡單的資料集
利用 make_moons
生成一個半月形資料集。
n = 800# 樣本數 n_labeled = 10 # 有標籤樣本數 X, Y = make_moons(n, shuffle=True, noise=0.1, random_state=1000) X.shape, Y.shape
((800, 2), (800,))
def one_hot(Y, n_classes):
    """One-hot encode an integer label vector.

    Parameters
    ----------
    Y : array of int
        Labels starting from 0.
    n_classes : int
        Number of distinct classes.

    Returns
    -------
    ndarray of float, shape (len(Y), n_classes)
        Row i has a 1.0 in column Y[i] and 0.0 elsewhere.
    """
    classes = np.arange(n_classes)
    return (Y[:, None] == classes).astype(float)
# Visualise the two classes, then build the (n, 2) label-input matrix:
# only the first n_labeled rows carry one-hot labels; the rest are all zeros.
color = ['red' if l == 0 else 'blue' for l in Y]
plt.scatter(X[:, 0], X[:, 1], color=color)
plt.savefig(f"{save_dir}/bi_classification.pdf", format='pdf')
plt.show()
Y_input = np.concatenate((one_hot(Y[:n_labeled], 2), np.zeros((n-n_labeled, 2))))
演算法過程:
Step 1: 建立相似度矩陣 W
def rbf(x, sigma):
    """Similarity kernel for the affinity matrix: exp(-x / (2 * sigma**2)).

    NOTE(review): the classic Zhou et al. LLGC affinity uses the *squared*
    distance, exp(-d**2 / (2 * sigma**2)); here the raw Euclidean distance
    is used unsquared. The reported accuracy later in this document depends
    on this form — confirm intent before "fixing" it.
    """
    denom = 2.0 * sigma ** 2
    return np.exp(-x / denom)
# Step 1: affinity matrix W from pairwise Euclidean distances.
sigma = 0.2
dm = cdist(X, X, 'euclidean')
# FIX: the original duplicated the line `vfunc = np.vectorize(rbf)` and used
# np.vectorize unnecessarily — rbf is built on np.exp, which already
# broadcasts over arrays, so calling it directly is equivalent and faster.
W = rbf(dm, sigma)
np.fill_diagonal(W, 0)  # zero the diagonal: no self-similarity
Step 2: 計算 S
\[ S = D^{-\frac{1}{2}} W D^{-\frac{1}{2}} \]
向量化程式設計:
def calculate_S(W):
    """Symmetrically normalize an affinity matrix: S = D^{-1/2} W D^{-1/2}.

    D is the diagonal degree matrix of W (row sums). Entries whose
    normalizer is zero are left untouched by np.divide's `where` mask,
    matching the original implementation.
    """
    degrees = W.sum(axis=1)
    # outer product degrees_i * degrees_j, then sqrt — equals D^{1/2} D^{1/2}
    denom = np.sqrt(np.outer(degrees, degrees))
    return np.divide(W, denom, where=denom != 0)

S = calculate_S(W)
迭代一次的結果
# One propagation step: F = alpha * (S @ Y_input) + (1 - alpha) * Y_input,
# then harden F to one-hot predictions and plot the resulting labelling.
alpha = 0.99
F = np.dot(S, Y_input)*alpha + (1-alpha)*Y_input
Y_result = np.zeros_like(F)
Y_result[np.arange(len(F)), F.argmax(1)] = 1
# flip column 0 so predicted class 0 -> red, class 1 -> blue below
Y_v = [1 if x == 0 else 0 for x in Y_result[0:,0]]
color = ['red' if l == 0 else 'blue' for l in Y_v]
plt.scatter(X[0:,0], X[0:,1], color=color)
#plt.savefig("iter_1.pdf", format='pdf')
plt.show()
Step 3: 迭代 F "n_iter" 次直到收斂
# Step 3: iterate F <- alpha * S F + (1 - alpha) * Y_input for n_iter steps.
# A fixed iteration count is used; convergence is assumed, not checked.
n_iter = 150
F = Y_input
for t in range(n_iter):
    F = np.dot(S, F)*alpha + (1-alpha)*Y_input
Step 4: 畫出最終結果
# Step 4: harden the propagated F into one-hot predictions and plot them.
Y_result = np.zeros_like(F)
Y_result[np.arange(len(F)), F.argmax(1)] = 1
# flip column 0 so predicted class 0 -> red, class 1 -> blue below
Y_v = [1 if x == 0 else 0 for x in Y_result[0:,0]]
color = ['red' if l == 0 else 'blue' for l in Y_v]
plt.scatter(X[0:,0], X[0:,1], color=color)
#plt.savefig("iter_n.pdf", format='pdf')
plt.show()
# Evaluate the transductive predictions against the true labels.
from sklearn import metrics
print(metrics.classification_report(Y, F.argmax(1)))
acc = metrics.accuracy_score(Y, F.argmax(1))
print('準確度為',acc)  # "accuracy is"
              precision    recall  f1-score   support

           0       1.00      0.86      0.92       400
           1       0.88      1.00      0.93       400

   micro avg       0.93      0.93      0.93       800
   macro avg       0.94      0.93      0.93       800
weighted avg       0.94      0.93      0.93       800

準確度為 0.92875
sklearn 實現 lgc
參考:https://scikit-learn.org/stable/modules/label_propagation.html
在 sklearn 裡提供了兩個 lgc 模型:LabelPropagation 與 LabelSpreading,其中後者是前者的正則化形式。相似度矩陣 \(W\) 的計算方式提供了 rbf 與 knn 兩種核:
- rbf 核:由引數 gamma 控制(\(\gamma=\frac{1}{2{\sigma}^2}\))
- knn 核:由引數 n_neighbors(近鄰數)控制
def pred_lgc(X, Y, F, numLabels):
    """Fit sklearn's LabelSpreading and return (y_true, y_pred) for the
    unlabeled portion of the data.

    Parameters
    ----------
    X : feature matrix
    Y : full ground-truth label vector
    F : label input where the first numLabels entries are known labels and
        the rest are marked unlabeled (-1)
    numLabels : number of labeled samples at the start of X
    """
    from sklearn import preprocessing
    from sklearn.semi_supervised import LabelSpreading
    model = LabelSpreading(max_iter=150, kernel='rbf', gamma=0.003, alpha=.99)
    # cast to float to avoid "Numerical issues were encountered"
    model.fit(preprocessing.scale(X.astype(float)), F)
    unlabeled = np.arange(numLabels, len(X))
    y_pred = model.transduction_[unlabeled]
    y_true = Y[numLabels:].astype(y_pred.dtype)
    return y_true, y_pred
# Labels for sklearn: first n_labeled entries known, the rest marked -1.
Y_input = np.concatenate((Y[:n_labeled], -np.ones(n-n_labeled)))
y_true, y_pred = pred_lgc(X, Y, Y_input, n_labeled)
# BUG FIX: the original printed classification_report(Y, F.argmax(1)),
# re-evaluating the stale F from the manual implementation above instead of
# the sklearn predictions just computed (which is why its output was
# identical to the earlier report). Evaluate y_true vs y_pred instead.
print(metrics.classification_report(y_true, y_pred))
              precision    recall  f1-score   support

           0       1.00      0.86      0.92       400
           1       0.88      1.00      0.93       400

   micro avg       0.93      0.93      0.93       800
   macro avg       0.94      0.93      0.93       800
weighted avg       0.94      0.93      0.93       800
networkx 實現 lgc
先談談 networkx 的 graph 與其他資料格式轉換(參考:https://www.cnblogs.com/wushaogui/p/9240941.html) :
從字典生成圖
import networkx as nx from networkx.algorithms import node_classification
# Build a graph from a dict-of-dicts and convert it back.
dod = {0: {1: {'weight': 1}}}
G = nx.from_dict_of_dicts(dod)  # or G = nx.Graph(dod)
plt.subplots(1,1,figsize=(6,3))
nx.draw(G)
plt.axis('on')
plt.xticks([])
plt.yticks([])
plt.show()
# convert the graph back to a dict of dicts
print(nx.to_dict_of_dicts(G))
{0: {1: {'weight': 1}}, 1: {0: {'weight': 1}}}
從列表中建立graph
# Build graphs from a dict-of-lists and from an edge list.
dol = {0: [1, 2, 3]}
edgelist = [(0, 1), (0, 3), (2, 3)]
G1 = nx.from_dict_of_lists(dol)  # or G = nx.Graph(dol)
G2 = nx.from_edgelist(edgelist)
# draw both graphs side by side
plt.subplots(1, 2, figsize=(5, 3))
plt.subplot(121)
nx.draw(G1, with_labels=True, font_weight='bold')
plt.axis('on')
plt.xticks([])
plt.yticks([])
plt.subplot(122)
nx.draw(G2, with_labels=True, font_weight='bold')
plt.axis('on')
plt.xticks([])
plt.yticks([])
plt.show()
# convert a graph back to list-based representations
print(nx.to_dict_of_lists(G1))
print(nx.to_edgelist(G1))
{0: [1, 2, 3], 1: [0], 2: [0], 3: [0]} [(0, 1, {}), (0, 2, {}), (0, 3, {})]
graph 與 numpy
# Build a directed graph from a random 10x10 0/1 adjacency matrix.
import numpy as np
# BUG FIX: np.random.random_integers was deprecated and has been removed
# from NumPy; randint(0, 2, ...) draws the same {0, 1} values.
a = np.reshape(np.random.randint(0, 2, size=100), (10, 10))
D = nx.DiGraph(a)
nx.draw(D, with_labels=True, font_weight='bold')
plt.axis('on')
plt.xticks([])
plt.yticks([])
plt.show()
# convert a graph back to NumPy representations
G = nx.Graph()
G.add_edge(1, 2, weight=7.0, cost=5)
A1 = nx.to_numpy_matrix(G)
A2 = nx.to_numpy_recarray(G, dtype=[('weight', float), ('cost', int)])
print(A1, A2)
[[0. 7.] [7. 0.]] [[(0., 0) (7., 5)] [(7., 5) (0., 0)]]
從 scipy 建立 graph
# Build a graph from a scipy sparse matrix.
G.clear()
# FIX: `import scipy as sp; sp.sparse.eye(...)` only works if scipy.sparse
# was already loaded elsewhere; import the submodule explicitly.
from scipy import sparse
A = sparse.eye(2, 2, 1)
G = nx.from_scipy_sparse_matrix(A)
# BUG FIX: the original drew D (the digraph from the previous cell) instead
# of the graph G just built from the sparse matrix.
nx.draw(G, with_labels=True, font_weight='bold')
plt.axis('on')
plt.xticks([])
plt.yticks([])
plt.show()
# convert the graph back to a scipy sparse matrix
A = nx.to_scipy_sparse_matrix(G)
print(A.todense())
[[0. 1.] [1. 0.]]
graph 與 Pandas
# Build a graph from a pandas adjacency DataFrame.
G.clear()
import pandas as pd
df = pd.DataFrame([[1, 1], [2, 1]])
G = nx.from_pandas_adjacency(df)
# BUG FIX: the original drew D (a digraph from an earlier cell) instead of
# the graph G just built from the DataFrame.
nx.draw(G, with_labels=True, font_weight='bold')
plt.axis('on')
plt.xticks([])
plt.yticks([])
plt.show()
# convert the graph back to a pandas adjacency DataFrame
df = nx.to_pandas_adjacency(G)
print(df)
01 01.02.0 12.01.0
參考: networkx.algorithms.node_classification.lgc.local_and_global_consistency 具體的細節,我還沒有研究!先放一個簡單的例子:
# Minimal node_classification example: label two nodes of a 4-node path
# graph and let local_and_global_consistency infer the remaining labels.
G = nx.path_graph(4)
# BUG FIX: the G.node attribute view was removed in networkx 2.4;
# G.nodes is the supported spelling (works in all networkx 2.x+).
G.nodes[0]['label'] = 'A'
G.nodes[3]['label'] = 'B'
G.nodes(data=True)
G.edges()
predicted = node_classification.local_and_global_consistency(G)
predicted
['A', 'A', 'B', 'B']
更多精彩內容見: DecodePaper 覺得有用,記得給個 star !(@ DecodePaper )