
From the site data pulled in over RSS feeds, I first built a simple keyword count (how that was done... a story for another day). The focus of this post is clustering.
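For reference, a minimal sketch of what such a keyword count could look like (my assumption, using the feedparser library; not necessarily how it was actually done here):

import re
from collections import Counter

import feedparser  # assumption: one common way to read RSS in Python

def wordcounts(feed_url):
    # tally lowercase words across every entry's title and summary
    parsed = feedparser.parse(feed_url)
    counts = Counter()
    for entry in parsed.entries:
        text = entry.get('title', '') + ' ' + entry.get('summary', '')
        counts.update(re.findall(r'[a-z]+', text.lower()))
    return counts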

"China" "kids" "music" "Yahoo"
Gothamist 0 3 3 0
GigaOM 6 0 0 2
Quick Online Tips 0 2 2 22

Treating the word frequencies that appear on a site as that site's features, we can define a Pearson similarity, or rather a similarity distance, over those features. The two closest items band together into a cluster first (strength in numbers); the new cluster is then compared with the remaining ones, and the closest pair again joins forces... extending level by level like this (Wikipedia explains it in detail) yields a tree diagram (dendrogram), and the method is called hierarchical clustering. Two figures from the wiki illustrate this quite clearly.
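As a quick numeric sanity check of this distance, here is a minimal sketch (my own, using numpy's corrcoef rather than the hand-written formula in the listing below) run on the three rows of the table above:

import numpy as np

gothamist = [0, 3, 3, 0]
gigaom    = [6, 0, 0, 2]
qot       = [0, 2, 2, 22]   # Quick Online Tips

def pearson_distance(v1, v2):
    r = np.corrcoef(v1, v2)[0, 1]   # Pearson correlation, -1 <= r <= 1
    return 1.0 - r                  # 0 = identical profile, 2 = opposite

print(pearson_distance(gothamist, gigaom))   # ~1.82: very different word profiles
print(pearson_distance(gothamist, qot))      # ~1.50: somewhat closer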

The implementation is as follows:


my_cluster.py
# my hierarchical clustering

from math import sqrt
import random
import numpy as np
def readfile(filename):
    # read the tab-separated matrix (e.g. 'blogdata.txt'): the first row is
    # the word list, each later row is a blog name followed by its counts

    with open(filename) as f:
        lines = f.readlines()

    wordlists = lines[0].strip().split('\t')[1:]
    bloglists = []
    data = []
    for j,line in enumerate(lines):
        if j==0:continue
        p = line.strip().split('\t')
        bloglists.append(p[0])
        
        data.append([float(x) for x in p[1:]])

    return bloglists,wordlists,data

def pearson(v1,v2):
    # Pearson similarity distance.
    # returns 1 - r, which lies in [0, 2]; the smaller the value, the better
    # the match (0 means the two profiles are perfectly correlated)


    # simple sums

    sum1 = sum(v1)
    sum2 = sum(v2)
    
    # sums of the squares

    sum1Sq = sum([pow(v,2) for v in v1])
    sum2Sq = sum([pow(v,2) for v in v2])
    
    # sums of the products

    pSum= sum([v1[i]*v2[i] for i in range(len(v1))])
    
    # calculate r (Pearson score)

    
    num = pSum - (sum1*sum2/len(v1))
    den = sqrt((sum1Sq - pow(sum1,2)/len(v1)) * (sum2Sq - pow(sum2,2)/len(v1)))
    
    if den == 0:
        return 0  # degenerate case: a constant vector has no defined correlation
    return 1.0 - num/den  # in [0, 2]; smaller means a better match


class bicluster:
    def __init__(self,vec,left=None,right=None,distance=0.0,id=None):

        self.vec=vec
        self.left=left
        self.right=right
        self.distance=distance
        self.id= id


def hcluster(rows,distance=pearson):

    distances={}
    currentclustid =-1

    # clusters are initially just the rows

    clust = [bicluster(row,id=i) for i,row in enumerate(rows)]
    
    while len(clust)>1:
        lowestpair =(0,1)
        closest = distance(clust[0].vec,clust[1].vec)

        ## loop through every pair looking for the smallest distance

        for i in range(len(clust)):
            for j in range(i+1,len(clust)):
                # look up the pairwise distance, computing and caching it by cluster id

                
                if (clust[i].id,clust[j].id) not in distances:
                    distances[(clust[i].id,clust[j].id)]=distance(clust[i].vec,clust[j].vec)

                d = distances[(clust[i].id,clust[j].id)]                    

                if d<closest:
                    closest = d 
                    lowestpair = (i,j)

        print('closest: {}, lowestpair: {}'.format(closest, lowestpair))

        # calculate the average of the two clusters

        mergevec = [
            (clust[lowestpair[0]].vec[i] + clust[lowestpair[1]].vec[i]) / 2.0
            for i in range(len(clust[0].vec))
        ]
        
        # create the new clustering

        newcluster = bicluster(mergevec,
                               left=clust[lowestpair[0]],
                               right=clust[lowestpair[1]],
                               distance=closest,
                               id=currentclustid)
        
        # cluster ids that weren't in the original set are negative

        currentclustid-=1
        del clust[lowestpair[1]]
        del clust[lowestpair[0]]
        clust.append(newcluster)
        
    return clust[0]
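
# (Added sketch, not in the original script: a small recursive printer to
# inspect the tree that hcluster() returns; merged clusters carry negative ids.)
def printclust(clust, labels=None, n=0):
    print(' ' * n, end='')                 # indent by depth
    if clust.id < 0:
        print('-')                         # internal node created by a merge
    else:
        print(clust.id if labels is None else labels[clust.id])
    if clust.left is not None:
        printclust(clust.left, labels, n + 1)
    if clust.right is not None:
        printclust(clust.right, labels, n + 1)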

### I also wrote a k-means for comparison. (Aside: the two can't really be compared... k-means gives a different result on every run, ugh.)

def kmeans(rows,distance=pearson,k=10):
    # randomly pick k of the samples as the starting centroids

    centroids=random.sample(rows,k)
    
    # assign every point to the cluster with the nearest centroid

    for iterno in range(100):
        KCluster= [[] for i in range(k)]        
        for id,row in enumerate(rows):
            d =[]
            for i in range(k):
                d.append(distance(row,centroids[i])) # distance relative to k centroid

            cluster_index = d.index(min(d)) #choose best group (min)

            KCluster[cluster_index].append(row) # put into different KCluster


        ## recompute each centroid as the mean of its cluster

        
        mean_centroids = [np.mean(KCluster[j], axis=0).tolist() if KCluster[j]
                          else centroids[j] for j in range(k)]  # keep the old centroid if a cluster empties

        if centroids == mean_centroids: break
        centroids = mean_centroids
        print "iteration number: {}".format(iterno)

    # map each clustered row back to its index in rows

    idCluster =[[] for i in range(k)]
    for ii in range(k):

        for jj in range(len(KCluster[ii])):
            idCluster[ii].append(rows.index(KCluster[ii][jj]))

    return KCluster,idCluster


    


if __name__ =='__main__':

    blognames,wordnames,data = readfile('blogdata.txt')
    hclust = hcluster(data)
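    # (Added usage sketch, not in the original script.) Print the tree with the
    # helper above, then run the hand-rolled k-means and map each cluster back
    # to blog names; the choice k=5 is arbitrary.
    printclust(hclust, labels=blognames)
    KCluster, idCluster = kmeans(data, k=5)
    for i, ids in enumerate(idCluster):
        print('cluster {}: {}'.format(i, [blognames[x] for x in ids]))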

ps:

  1. Implementing the hierarchical algorithm with sklearn is what you would call getting the answer in 0.1 seconds. For details, see here. (A small sketch of the library route follows after this list.)

  2. I do not yet know how to visualize the dendrogram; if I figure it out later I will come back and fill this in. (The sketch below shows one possible route.)

  3. Learning machine-learning algorithms up to this point feels a bit like playing pretend: a from sklearn import xxx solves everything, while hand-rolling the algorithm myself is slow to write and slow to run. Worst of all, anyone who actually knows the field probably sees this as child's play... sigh... sob.
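For ps 1 and 2, a minimal sketch of the library route (assuming scipy and matplotlib are installed, and that blogdata.txt is the same file read above). scipy.cluster.hierarchy both clusters and draws the dendrogram; method='average' is close in spirit to the vector-averaging merge above, though not identical:

import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram
from my_cluster import readfile

blognames, wordnames, data = readfile('blogdata.txt')
Z = linkage(data, method='average', metric='correlation')  # correlation = 1 - pearson
dendrogram(Z, labels=blognames, orientation='right')
plt.tight_layout()
plt.show()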
