# 聚类

（篇幅所限，大多数代码就不贴了，文末给出代码地址。）

# Cluster the samples into 8 groups; fit() returns the fitted estimator,
# so construction and fitting can be chained.
km = KMeans(n_clusters=8).fit(X)


# 大簇小簇

$$|C_1|\ge|C_2|\ge|C_3|\ge\dots\ge|C_k|$$

## 突降

pyod是这样实现的：能找到同时满足两个条件的分割最好。次优是找到满足绝对多数原则的分割。最差是只找到了满足突降原则的分割。

# Split the clusters into "large" and "small" groups.
#
# Clusters are visited in the order they appear in df_cluster_sizes
# (assumed to be sorted by size, largest first -- TODO confirm against the
# code that builds df_cluster_sizes).  The boundary is the first cluster at
# which BOTH conditions hold:
#   * alpha: the running point count exceeds n_points_in_large_clusters
#   * beta:  this cluster is more than `beta` times the size of the next one
# The boundary cluster and everything before it are "large"; everything after
# it is "small".  If no cluster satisfies both conditions, every cluster ends
# up in large_clusters and small_clusters stays empty.
large_clusters = []
small_clusters = []
found_b = False  # becomes True once the boundary cluster has been found
count = 0  # running number of points in the clusters visited so far
clusters = df_cluster_sizes['cluster'].values
n_clusters = len(clusters)
sizes = df_cluster_sizes['size'].values
for i in range(n_clusters):
    print(f"-----------iterration {i}--------------")
    satisfy_alpha = False
    satisfy_beta = False
    if found_b:
        # Past the boundary: every remaining cluster is small.
        small_clusters.append(clusters[i])
        continue
    count += sizes[i]
    print(count)
    if count > n_points_in_large_clusters:
        satisfy_alpha = True
    # The last cluster has no successor, so the size ratio is undefined there.
    # Compute (and print) it only when sizes[i + 1] exists; previously the
    # debug print indexed sizes[i + 1] unconditionally and raised IndexError
    # on the final iteration.
    if i < n_clusters - 1:
        ratio = sizes[i] / sizes[i + 1]
        print(ratio)
        if ratio > beta:
            print("beta")
            satisfy_beta = True
    print(satisfy_alpha, satisfy_beta)
    if satisfy_alpha and satisfy_beta:
        found_b = True
    # The current cluster (including the boundary one) counts as large.
    large_clusters.append(clusters[i])


# Factor 因子

def get_distance(a, b):
    """Return the Euclidean distance between points *a* and *b*.

    Both arguments are coordinate sequences (tuples, lists or NumPy arrays)
    of the same length.  All coordinates take part in the distance, so the
    function works for any number of features; for two-dimensional points the
    result is identical to sqrt((a0-b0)**2 + (a1-b1)**2).
    """
    diff = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
    return np.sqrt(np.sum(diff ** 2))
def decision_function(X, labels):
    """Compute one distance-based score per sample.

    For each index ``i`` in ``labels``:

    * if ``labels[i]`` is in the module-level ``large_clusters``, the score is
      the distance from ``X[i]`` to that cluster's own centre
      (``km.cluster_centers_[label]``);
    * otherwise the score is the smallest distance from ``X[i]`` to any centre
      in the module-level ``large_cluster_centers`` (``None`` if that
      collection is empty).

    Returns the scores as a NumPy array, in the same order as ``labels``.
    """
    scores = []
    for idx, label in enumerate(labels):
        point = X[idx]
        if label in large_clusters:
            # Sample belongs to a large cluster: measure to its own centre.
            score = get_distance(point, km.cluster_centers_[label])
        else:
            # Sample belongs to a small cluster: measure to the nearest
            # large-cluster centre.
            score = min(
                (get_distance(point, c) for c in large_cluster_centers),
                default=None,
            )
        scores.append(score)
    return np.array(scores)
# Score every sample using the cluster labels assigned by the fitted KMeans.
distances = decision_function(X, km.labels_)


# Use the 99th percentile of the scores as the cut-off: samples scoring
# strictly above it are labelled 1, all others 0.
threshold = np.percentile(distances, 99)
print(f"threshold is {threshold}")
anomaly_labels = (distances > threshold).astype(int)


(上图代码抄的Susan Li的文章)

# 源代码

https://github.com/EricWebsmith/machine_learning_from_scrach

# 参考

He, Zengyou, Xiaofei Xu, and Shengchun Deng. “Discovering cluster-based local outliers.” Pattern Recognition Letters 24.9-10 (2003): 1641-1650.

https://github.com/yzhao062/pyod

towardsdatascience.com 网站Susan Li的文章 Anomaly Detection for Dummies

https://blog.csdn.net/juwikuang/article/details/108699190