over 1 year ago

賣鮮花的漂亮女孩在買鮮花

對於上面一段話來說,要如何提取關鍵字?

可以借助pageRank的想法在文字領域。如果上述一段話可以切分成為,

賣/鮮花/的/漂亮女孩/在/買/鮮花

選擇一個特定長度作為觀察的窗口大小(ex:3),計算針對每個詞在窗口為3之間的共現次數(停用掉常見詞彙-stopwords:的/在),表示成下表

--       | 賣 | 鮮花 | 漂亮女孩 | 買
賣       | 0  | 1   | 1       | 1
鮮花     | 1  | 1   | 1       | 1
漂亮女孩 | 1  | 2   | 0       | 1
買       | 1  | 2   | 1       | 0

將上表轉換成詞彙之間的權重關係圖,

假設每一個詞的得分TextRank為TR,d為阻尼係數,則

$$TR(V_i) = (1 - d) + d \sum_{V_j \in In(V_i)} \frac{w_{ji}}{\sum_{V_k \in Out(V_j)} w_{jk}} TR(V_j)$$

初始值先給定1後,經過不斷的迭代後此值會趨於收斂。

--       | 初始值 | 1次迭代 | 2次迭代 | 3次迭代 | 4次迭代
賣       | 1     | 0.97   | 0.94   | 0.93   | 0.92
鮮花     | 1     | 1.38   | 1.33   | 1.31   | 1.29
漂亮女孩 | 1     | 0.96   | 0.94   | 0.93   | 0.92
買       | 1     | 0.96   | 0.94   | 0.92   | 0.92

所以以此例來說,會找到關鍵字為鮮花

程式碼實作

首先必須產生共現詞的關係圖,利用defaultdict來產生dict of dict結構。輸入一串還未分詞的短句,能完成分詞且自動建立分詞後的詞與詞的共現(Co-occurrence)關係。這邊利用小巧好用的jieba來作分詞。

def coocurance(text, windows=3):
    """Build a word co-occurrence graph from an unsegmented sentence.

    The text is segmented with jieba, stop words (module-level
    ``STOP_WORDS``) are dropped, and every surviving word is linked to
    the words that appear within ``windows`` tokens of it.

    Args:
        text: raw (unsegmented) sentence.
        windows: maximum token distance for two words to co-occur.

    Returns:
        ``defaultdict(Counter)`` mapping each word to a Counter of its
        co-occurring words and their counts.
    """
    word_lst = [w for w in jieba.lcut(text) if w not in STOP_WORDS]

    data = defaultdict(Counter)
    for i, word in enumerate(word_lst):
        # Python slicing clamps an out-of-range end bound, so one slice
        # covers the begin/middle/end cases the original handled with
        # three redundant branches; only the start needs clamping.
        window = word_lst[max(0, i - windows):i + windows + 1]
        counts = Counter(window)
        # BUGFIX: remove only the CURRENT occurrence of the word.
        # ``Counter.pop(word)`` deleted the key outright, discarding
        # genuine self-co-occurrences when the word appears twice in
        # the window (the article's table counts 鮮花–鮮花 = 1).
        counts[word] -= 1
        if counts[word] == 0:
            del counts[word]
        data[word] += counts
    return data

接著就是實作基於共現圖關係(Coocurance)的TextRank計算,方式如前面圖表所述。

def textRank(graph, d=0.85, kw_num=3, with_weight=False):
    """Score the words of a co-occurrence graph with TextRank.

    Args:
        graph: mapping ``word -> {co-occurring word: edge weight}``
            (e.g. the dict of Counters built by ``coocurance``).
        d: damping factor (probability of following a graph edge).
        kw_num: number of top keywords to return.
        with_weight: if True, return ``(word, score)`` pairs with the
            scores normalized so the best keyword scores 1.0;
            otherwise return just the words.

    Returns:
        List of the ``kw_num`` highest-ranked words (or pairs).
    """
    # Every node starts with score 1.0 and is iterated to convergence.
    TR = defaultdict(float, ((word, 1.0) for word in graph))

    err = 1e-4        # convergence threshold on summed squared change
    error = 1.0
    max_iter = 100
    iteration = 1
    while iteration < max_iter and error > err:
        error = 0.0
        TR_prev = TR.copy()
        for word, cooc in graph.items():
            rank = 0.0
            for link_word, weight in cooc.items():
                # link_word passes its score on, split proportionally
                # across its outgoing edge weights.
                rank += d * TR[link_word] * weight / sum(graph[link_word].values())
            TR[word] = 1 - d + rank
            # BUGFIX: accumulate the change of EVERY word. The original
            # computed this outside the loop, so convergence was judged
            # from the last word alone.
            error += (TR[word] - TR_prev[word]) ** 2
        iteration += 1

    # Sort by (score, word) descending; ties broken by the word itself.
    ranked = sorted(TR.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    if with_weight:
        top_score = ranked[0][1]  # normalize so the top keyword is 1.0
        return [(w, s / top_score) for w, s in ranked[:kw_num]]
    return [w for w, s in ranked[:kw_num]]

主函式在此

from __future__ import division
import numpy as np
import jieba
import re
import jieba.analyse
from collections import Counter,defaultdict

file_name = 'dict/stopwords_tw.txt'
# jieba.analyse.set_stop_words(file_name)

jieba.set_dictionary('dict/dict.txt.big')

if __name__ == '__main__':

    ## load stop words ##

    with open(file_name) as f:
        doc = f.read()
    doc = doc.decode('utf8')
    doc = re.sub('\r\n','\n',doc)
    global STOP_WORDS
    STOP_WORDS = doc.split('\n')
    #


    text = u'賣鮮花的漂亮女孩在買鮮花'
    data = coocurance(text,windows=5)
    # # keywords

    kw = textRank(data,d=0.75,kw_num=3)

接著能基於TextRank算法作文章摘要。交給另外一篇文章作說明。

← 搜尋引擎(三)網頁排序 文章摘要-TextRank算法 →
 
comments powered by Disqus