標籤傳播演算法——llgc
動手實踐標籤傳播演算法
初始化演算法
載入一些必備的庫:
# Notebook prelude: imports and the output directory for saved figures.
from IPython.display import set_matplotlib_formats
%matplotlib inline
#set_matplotlib_formats('svg', 'pdf')
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from sklearn.datasets import make_moons
save_dir = '../data/images'  # directory where figures are written
建立一個簡單的資料集
利用 make_moons
生成一個半月形資料集。
n = 800# 樣本數 n_labeled = 10 # 有標籤樣本數 X, Y = make_moons(n, shuffle=True, noise=0.1, random_state=1000) X.shape, Y.shape
((800, 2), (800,))
def one_hot(Y, n_classes):
    """One-hot encode an integer label vector.

    Parameters
    ----------
    Y : array of int
        Labels starting from 0.
    n_classes : int
        Number of distinct classes.

    Returns
    -------
    ndarray of float, shape (len(Y), n_classes)
        Row i has a 1.0 in column Y[i] and 0.0 elsewhere.
    """
    classes = np.arange(n_classes)
    return (Y[:, None] == classes).astype(float)
# Visualise the two classes, then build the (n, 2) label-input matrix:
# only the first n_labeled rows carry one-hot labels; the rest are all zeros.
color = ['red' if l == 0 else 'blue' for l in Y]
plt.scatter(X[:, 0], X[:, 1], color=color)
plt.savefig(f"{save_dir}/bi_classification.pdf", format='pdf')
plt.show()
Y_input = np.concatenate((one_hot(Y[:n_labeled], 2), np.zeros((n-n_labeled, 2))))
演算法過程:
Step 1: 建立相似度矩陣 W
def rbf(x, sigma):
    """Similarity kernel for the affinity matrix: exp(-x / (2 * sigma**2)).

    NOTE(review): the classic Zhou et al. LLGC affinity uses the *squared*
    distance, exp(-d**2 / (2 * sigma**2)); here the raw Euclidean distance
    is used unsquared. The reported accuracy later in this document depends
    on this form — confirm intent before "fixing" it.
    """
    denom = 2.0 * sigma ** 2
    return np.exp(-x / denom)
# Step 1: affinity matrix W from pairwise Euclidean distances.
sigma = 0.2
dm = cdist(X, X, 'euclidean')
# FIX: the original duplicated the line `vfunc = np.vectorize(rbf)` and used
# np.vectorize unnecessarily — rbf is built on np.exp, which already
# broadcasts over arrays, so calling it directly is equivalent and faster.
W = rbf(dm, sigma)
np.fill_diagonal(W, 0)  # zero the diagonal: no self-similarity
Step 2: 計算 S
\[ S = D^{-\frac{1}{2}} W D^{-\frac{1}{2}} \]
向量化程式設計:
def calculate_S(W):
    """Symmetrically normalize an affinity matrix: S = D^{-1/2} W D^{-1/2}.

    D is the diagonal degree matrix of W (row sums). Entries whose
    normalizer is zero are left untouched by np.divide's `where` mask,
    matching the original implementation.
    """
    degrees = W.sum(axis=1)
    # outer product degrees_i * degrees_j, then sqrt — equals D^{1/2} D^{1/2}
    denom = np.sqrt(np.outer(degrees, degrees))
    return np.divide(W, denom, where=denom != 0)

S = calculate_S(W)
迭代一次的結果
# One propagation step: F = alpha * (S @ Y_input) + (1 - alpha) * Y_input,
# then harden F to one-hot predictions and plot the resulting labelling.
alpha = 0.99
F = np.dot(S, Y_input)*alpha + (1-alpha)*Y_input
Y_result = np.zeros_like(F)
Y_result[np.arange(len(F)), F.argmax(1)] = 1
# flip column 0 so predicted class 0 -> red, class 1 -> blue below
Y_v = [1 if x == 0 else 0 for x in Y_result[0:,0]]
color = ['red' if l == 0 else 'blue' for l in Y_v]
plt.scatter(X[0:,0], X[0:,1], color=color)
#plt.savefig("iter_1.pdf", format='pdf')
plt.show()
Step 3: 迭代 F "n_iter" 次直到收斂
# Step 3: iterate F <- alpha * S F + (1 - alpha) * Y_input for n_iter steps.
# A fixed iteration count is used; convergence is assumed, not checked.
n_iter = 150
F = Y_input
for t in range(n_iter):
    F = np.dot(S, F)*alpha + (1-alpha)*Y_input
Step 4: 畫出最終結果
# Step 4: harden the propagated F into one-hot predictions and plot them.
Y_result = np.zeros_like(F)
Y_result[np.arange(len(F)), F.argmax(1)] = 1
# flip column 0 so predicted class 0 -> red, class 1 -> blue below
Y_v = [1 if x == 0 else 0 for x in Y_result[0:,0]]
color = ['red' if l == 0 else 'blue' for l in Y_v]
plt.scatter(X[0:,0], X[0:,1], color=color)
#plt.savefig("iter_n.pdf", format='pdf')
plt.show()
# Evaluate the transductive predictions against the true labels.
from sklearn import metrics
print(metrics.classification_report(Y, F.argmax(1)))
acc = metrics.accuracy_score(Y, F.argmax(1))
print('準確度為',acc)  # "accuracy is"
              precision    recall  f1-score   support

           0       1.00      0.86      0.92       400
           1       0.88      1.00      0.93       400

   micro avg       0.93      0.93      0.93       800
   macro avg       0.94      0.93      0.93       800
weighted avg       0.94      0.93      0.93       800

準確度為 0.92875
sklearn 實現 lgc
參考:https://scikit-learn.org/stable/modules/label_propagation.html
在 sklearn 裡提供了兩個 lgc 模型:LabelPropagation 與 LabelSpreading,其中後者是前者的正則化形式。相似度矩陣 \(W\) 的計算方式提供了 rbf 與 knn 兩種核:
- rbf 核:由引數 gamma 控制(\(\gamma=\frac{1}{2{\sigma}^2}\))
- knn 核:由引數 n_neighbors(近鄰數)控制
def pred_lgc(X, Y, F, numLabels):
    """Fit sklearn's LabelSpreading and return (y_true, y_pred) for the
    unlabeled portion of the data.

    Parameters
    ----------
    X : feature matrix
    Y : full ground-truth label vector
    F : label input where the first numLabels entries are known labels and
        the rest are marked unlabeled (-1)
    numLabels : number of labeled samples at the start of X
    """
    from sklearn import preprocessing
    from sklearn.semi_supervised import LabelSpreading
    model = LabelSpreading(max_iter=150, kernel='rbf', gamma=0.003, alpha=.99)
    # cast to float to avoid "Numerical issues were encountered"
    model.fit(preprocessing.scale(X.astype(float)), F)
    unlabeled = np.arange(numLabels, len(X))
    y_pred = model.transduction_[unlabeled]
    y_true = Y[numLabels:].astype(y_pred.dtype)
    return y_true, y_pred
# Labels for sklearn: first n_labeled entries known, the rest marked -1.
Y_input = np.concatenate((Y[:n_labeled], -np.ones(n-n_labeled)))
y_true, y_pred = pred_lgc(X, Y, Y_input, n_labeled)
# BUG FIX: the original printed classification_report(Y, F.argmax(1)),
# re-evaluating the stale F from the manual implementation above instead of
# the sklearn predictions just computed (which is why its output was
# identical to the earlier report). Evaluate y_true vs y_pred instead.
print(metrics.classification_report(y_true, y_pred))
              precision    recall  f1-score   support

           0       1.00      0.86      0.92       400
           1       0.88      1.00      0.93       400

   micro avg       0.93      0.93      0.93       800
   macro avg       0.94      0.93      0.93       800
weighted avg       0.94      0.93      0.93       800
networkx 實現 lgc
先談談 networkx 的 graph 與其他資料格式轉換(參考:https://www.cnblogs.com/wushaogui/p/9240941.html) :
從字典生成圖
import networkx as nx from networkx.algorithms import node_classification
# Build a graph from a dict-of-dicts and convert it back.
dod = {0: {1: {'weight': 1}}}
G = nx.from_dict_of_dicts(dod)  # or G = nx.Graph(dod)
plt.subplots(1,1,figsize=(6,3))
nx.draw(G)
plt.axis('on')
plt.xticks([])
plt.yticks([])
plt.show()
# convert the graph back to a dict of dicts
print(nx.to_dict_of_dicts(G))
{0: {1: {'weight': 1}}, 1: {0: {'weight': 1}}}
從列表中建立graph
# Build graphs from a dict-of-lists and from an edge list.
dol = {0: [1, 2, 3]}
edgelist = [(0, 1), (0, 3), (2, 3)]
G1 = nx.from_dict_of_lists(dol)  # or G = nx.Graph(dol)
G2 = nx.from_edgelist(edgelist)
# draw both graphs side by side
plt.subplots(1, 2, figsize=(5, 3))
plt.subplot(121)
nx.draw(G1, with_labels=True, font_weight='bold')
plt.axis('on')
plt.xticks([])
plt.yticks([])
plt.subplot(122)
nx.draw(G2, with_labels=True, font_weight='bold')
plt.axis('on')
plt.xticks([])
plt.yticks([])
plt.show()
# convert a graph back to list-based representations
print(nx.to_dict_of_lists(G1))
print(nx.to_edgelist(G1))
{0: [1, 2, 3], 1: [0], 2: [0], 3: [0]} [(0, 1, {}), (0, 2, {}), (0, 3, {})]
graph 與 numpy
# Build a directed graph from a random 10x10 0/1 adjacency matrix.
import numpy as np
# BUG FIX: np.random.random_integers was deprecated and has been removed
# from NumPy; randint(0, 2, ...) draws the same {0, 1} values.
a = np.reshape(np.random.randint(0, 2, size=100), (10, 10))
D = nx.DiGraph(a)
nx.draw(D, with_labels=True, font_weight='bold')
plt.axis('on')
plt.xticks([])
plt.yticks([])
plt.show()
# convert a graph back to NumPy representations
G = nx.Graph()
G.add_edge(1, 2, weight=7.0, cost=5)
A1 = nx.to_numpy_matrix(G)
A2 = nx.to_numpy_recarray(G, dtype=[('weight', float), ('cost', int)])
print(A1, A2)
[[0. 7.] [7. 0.]] [[(0., 0) (7., 5)] [(7., 5) (0., 0)]]
從 scipy 建立 graph
# Build a graph from a scipy sparse matrix.
G.clear()
# FIX: `import scipy as sp; sp.sparse.eye(...)` only works if scipy.sparse
# was already loaded elsewhere; import the submodule explicitly.
from scipy import sparse
A = sparse.eye(2, 2, 1)
G = nx.from_scipy_sparse_matrix(A)
# BUG FIX: the original drew D (the digraph from the previous cell) instead
# of the graph G just built from the sparse matrix.
nx.draw(G, with_labels=True, font_weight='bold')
plt.axis('on')
plt.xticks([])
plt.yticks([])
plt.show()
# convert the graph back to a scipy sparse matrix
A = nx.to_scipy_sparse_matrix(G)
print(A.todense())
[[0. 1.] [1. 0.]]
graph 與 Pandas
# Build a graph from a pandas adjacency DataFrame.
G.clear()
import pandas as pd
df = pd.DataFrame([[1, 1], [2, 1]])
G = nx.from_pandas_adjacency(df)
# BUG FIX: the original drew D (a digraph from an earlier cell) instead of
# the graph G just built from the DataFrame.
nx.draw(G, with_labels=True, font_weight='bold')
plt.axis('on')
plt.xticks([])
plt.yticks([])
plt.show()
# convert the graph back to a pandas adjacency DataFrame
df = nx.to_pandas_adjacency(G)
print(df)
01 01.02.0 12.01.0
參考: networkx.algorithms.node_classification.lgc.local_and_global_consistency 具體的細節,我還沒有研究!先放一個簡單的例子:
# Minimal node_classification example: label two nodes of a 4-node path
# graph and let local_and_global_consistency infer the remaining labels.
G = nx.path_graph(4)
# BUG FIX: the G.node attribute view was removed in networkx 2.4;
# G.nodes is the supported spelling (works in all networkx 2.x+).
G.nodes[0]['label'] = 'A'
G.nodes[3]['label'] = 'B'
G.nodes(data=True)
G.edges()
predicted = node_classification.local_and_global_consistency(G)
predicted
['A', 'A', 'B', 'B']
更多精彩內容見: DecodePaper 覺得有用,記得給個 star !(@ DecodePaper )