利用 sklearn 包里的 BIRCH 算法对 iris 数据集进行聚类,并将聚类结果可视化(html)
该算法的代码如下:
# Cluster the Iris measurements with scikit-learn's BIRCH algorithm:
#   1. read comma-separated numeric rows from a text file,
#   2. plot the raw data,
#   3. fit BIRCH (3 clusters) and print the size of each cluster,
#   4. plot the clustered data and print the Calinski-Harabasz score.
import sys
import urllib.request

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.cluster import Birch
# make_blobs is exported by sklearn.datasets; the former
# sklearn.datasets.samples_generator module path no longer exists in
# current scikit-learn releases.
from sklearn.datasets import make_blobs
from sklearn.manifold import TSNE

# Raw string so the Windows backslashes are never read as escape sequences.
DATA_PATH = r"F:\Tabtad\Downloads\iris\Iris.txt"

# Parse the data file: one sample per line, comma-separated numeric fields.
xList = []
with open(DATA_PATH, 'r') as f:
    for data in f:
        stripped = data.strip()
        if not stripped:
            # Skip blank lines (e.g. a trailing newline at the end of the
            # file) instead of failing on float('').
            continue
        row = [float(value) for value in stripped.split(",")]
        # Drop the first column so only the remaining fields are clustered.
        # NOTE(review): presumably this column is a label/ID rather than a
        # feature - confirm against the data file.
        del row[0]
        xList.append(row)

if not xList:
    raise ValueError("No data rows found in " + DATA_PATH)

X = np.array(xList)  # samples as a NumPy matrix
print(xList)
print(X)

# Data before clustering.
# NOTE(review): the third positional argument of plt.scatter is the marker
# size `s`, not a z coordinate, so column 2 controls the marker size here.
# It is passed by keyword to make that explicit; confirm this is intended.
plt.scatter(X[:, 0], X[:, 1], s=X[:, 2], marker='o')
plt.show()

nrow = len(xList)
ncol = len(xList[0])
# The explicit '\n' keeps the blank line that follows the row count.
print("Number of Rows of Data = " + str(nrow) + '\n')
print("Number of Columns of Data = " + str(ncol))

# Configure BIRCH and fit it; fit_predict returns one cluster label per row.
model = Birch(n_clusters=3, threshold=0.4)
y_pred = model.fit_predict(X)

# Number of samples assigned to each cluster label.
r1 = pd.Series(model.labels_).value_counts()
print(r1)

# Same scatter plot as above, coloured by the predicted cluster.
plt.scatter(X[:, 0], X[:, 1], s=X[:, 2], c=y_pred)
plt.show()

print("Calinski_Harabasz Score", metrics.calinski_harabasz_score(X, y_pred))