over 2 years ago

延續之前實作(二)的問題,如果使用以用戶為基礎的相似度來作判斷,會出現如簡介所說的問題:

  • 表單上大部分的用戶欄位是空的
  • 必須用戶同時上線,才能計算用戶的相似性

以商品為基礎

對Toby來說的商品推薦,先利用皮爾森相似度計算每個商品之間的相似性,然後以相似度為權重,對Toby已評分的商品做加權平均(總分/相似度之和),作為未評分商品的預測分數:

電影 評分 Night相似度 評分*Night相似度 Lady相似度 評分*Lady相似度 Luck相似度 評分*Luck相似度
Snakes on a Plane 4.5 -0.56 - 0.76 3.42 - -
Superman Returns 4 -0.18 - 0.48 1.92 - -
You, Me and Dupree 1 -0.25 - 0.33 0.33 - -
總分/相似度之和 3.61

相似度為負值或無法計算的品項不列入推薦。所以我們只會推薦Lady in the Water(與之前以用戶為基礎的結果比較很不一樣)

recommendation_pearson.py
## using Pearson correlation to evaluate the similarity

## collaborative filtering based on 1) users 2)items 


# 2016.02.05


# module 

from collections import defaultdict
import pandas as pd
import numpy as np

### data ####

# Sample ratings: critic name -> {movie title -> score}.
# Critics who have not rated a movie simply have no entry for it.
critics = {
    'Lisa Rose': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        'You, Me and Dupree': 2.5,
        'The Night Listener': 3.0,
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 3.5,
    },
    'Michael Phillips': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.0,
        'Superman Returns': 3.5,
        'The Night Listener': 4.0,
    },
    'Claudia Puig': {
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'The Night Listener': 4.5,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 2.5,
    },
    'Mick LaSalle': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'Just My Luck': 2.0,
        'Superman Returns': 3.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 2.0,
    },
    'Jack Matthews': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'The Night Listener': 3.0,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
    },
    'Toby': {
        'Snakes on a Plane': 4.5,
        'You, Me and Dupree': 1.0,
        'Superman Returns': 4.0,
    },
}

# User-based view: one column per critic, one row per movie (NaN = not rated).
data = pd.DataFrame(data=critics)
# Item-based view: one column per movie, one row per critic.
dataItem = data.transpose()
## return the similarity score between two persons


def sim_distance(data,person1,person2):
    """Return a Euclidean-distance-based similarity between two columns.

    Only rows where both ``person1`` and ``person2`` have a value contribute
    to the distance.  The distance ``d`` is mapped to ``1 / (1 + d)`` so that
    identical ratings give 1.0 and the score decreases as ratings diverge.

    Parameters:
        data: DataFrame whose columns are the entities being compared.
        person1, person2: column labels in ``data``.

    Returns:
        Similarity in (0, 1], or 0.0 when the two columns share no rated rows.
    """
    # NaN - x is NaN, so dropping NaN keeps only the co-rated rows.
    diff = (data[person1] - data[person2]).dropna()

    # Without any shared rating the distance would be 0 and the score would be
    # a misleading "perfect" 1.0; report no similarity instead.
    if diff.empty:
        return 0.0

    distance = np.sqrt((diff ** 2).sum())
    return 1 / (1 + distance)

def sim_pearson(data,p1,p2):
    """Return the Pearson correlation between two columns of ``data``.

    The correlation is computed only over rows where both columns have a
    value.

    Parameters:
        data: DataFrame whose columns are the entities being compared.
        p1, p2: column labels in ``data``.

    Returns:
        Correlation coefficient in [-1, 1].  Returns 0.0 when the correlation
        is undefined: fewer than two shared rows, or one side is constant over
        the shared rows (``np.corrcoef`` would yield NaN in those cases).
    """
    first = data[p1]
    second = data[p2]

    # Rows rated on both sides.
    shared = first.notnull() & second.notnull()
    x = np.asarray(first[shared], dtype=float)
    y = np.asarray(second[shared], dtype=float)

    # A correlation needs at least two points ...
    if x.size < 2:
        return 0.0
    # ... and non-zero variance on both sides (max == min means constant).
    if x.max() == x.min() or y.max() == y.min():
        return 0.0

    return np.corrcoef(x, y)[0, 1]



def top_match(data,person,n=5, similarity= sim_pearson):
    """Return the ``n`` columns of ``data`` most similar to ``person``.

    Parameters:
        data: DataFrame whose columns are the entities being compared.
        person: column label to find matches for (excluded from the result).
        n: maximum number of matches to return.
        similarity: callable ``similarity(data, a, b)`` returning a score.

    Returns:
        List of ``(other, score)`` tuples sorted by score, highest first.
    """
    scores = [(other, similarity(data, other, person))
              for other in data if other != person]
    # Sort on the score element; an explicit index works on both Python 2 and
    # Python 3 (tuple-unpacking lambda parameters are a syntax error on 3).
    scores.sort(key=lambda pair: pair[1], reverse=True)

    return scores[:n]


def getRecommendations(data,person,similarity=sim_pearson):
    """Recommend items ``person`` has not rated, using similar users' ratings.

    Each unrated item is scored as the similarity-weighted average of the
    ratings given by other users.  Only users whose similarity to ``person``
    is strictly positive contribute.

    Parameters:
        data: DataFrame with one column per user and one row per item;
            NaN marks "not rated".
        person: column label of the user to recommend for.
        similarity: callable ``similarity(data, a, b)`` returning a score.

    Returns:
        List of ``(item, predicted_score)`` tuples sorted by predicted score,
        highest first.  Items that no positively-similar user has rated are
        omitted because no prediction can be made for them.
    """
    # Similarity of every other user to ``person``.
    sims = pd.Series(
        {other: similarity(data, other, person)
         for other in data if other != person},
        dtype=float,
    )
    # Ignore users with zero/negative (or undefined, i.e. NaN) similarity.
    sims = sims[sims > 0]
    if sims.empty:
        return []

    # Items the person has not rated yet.
    own = data[person]
    unseen = own[own.isnull()].index

    rankings = []
    for item in unseen:
        # Ratings of this item by the similar users who actually rated it.
        ratings = data.loc[item, sims.index]
        ratings = ratings[ratings.notnull()]
        if ratings.empty:
            continue

        # Normalise by the similarities of the users who rated *this* item,
        # so an item is not penalised for having been rated by fewer users.
        weights = sims[ratings.index]
        score = float((ratings * weights).sum() / weights.sum())
        rankings.append((item, score))

    # Best prediction first (sort on the score, not on the item title).
    rankings.sort(key=lambda pair: pair[1], reverse=True)
    return rankings


### Item based Recommendation ####


def calculateSimiliarItems(data,item,n=10,similarity=sim_pearson):
    """Score every other column of ``data`` against ``item``.

    Parameters:
        data: DataFrame whose columns are the items being compared.
        item: column label to compare the others against.
        n: accepted for interface compatibility; not used by this function.
        similarity: callable ``similarity(data, a, b)`` returning a score.

    Returns:
        Dict mapping each other column label to its similarity with ``item``.
        The dict is not sorted or truncated.
    """
    # Sample scores recorded in the original notes (shown there as a list
    # sorted by score; which item they belong to is not stated):
    #   ('You, Me and Dupree', 0.65795169495976902)
    #   ('Lady in the Water', 0.48795003647426655)
    #   ('Snakes on a Plane', 0.11180339887498947)
    #   ('The Night Listener', -0.1798471947990542)
    #   ('Just My Luck', -0.42289003161103106)
    result = {}
    for other in data:
        if other == item:
            continue
        result[other] = similarity(data, other, item)
    return result

# Item-to-item similarity table, computed once at import time:
# itemsSimilarity[a][b] is the similarity of item b to item a.
itemsSimilarity = dict(
    (title, calculateSimiliarItems(dataItem, title)) for title in dataItem
)
# Same table as a DataFrame (one column per item) for label-based lookup.
df_sim_items = pd.DataFrame(data=itemsSimilarity)

def getRecommendedItem(dataItem,user,itemSimilarity=None):
    """Predict scores for items ``user`` has not rated (item-based filtering).

    For each unrated item, the prediction is the average of the user's own
    ratings weighted by how similar each rated item is to the unrated one.
    Only strictly positive similarities are used.

    Parameters:
        dataItem: DataFrame with one row per user and one column per item;
            NaN marks "not rated".
        user: row label of the user to recommend for.
        itemSimilarity: item-to-item similarity lookup supporting
            ``itemSimilarity[candidate][rated_item]`` (e.g. a DataFrame or a
            dict of dicts).  Defaults to the module-level ``df_sim_items``.

    Returns:
        ``defaultdict(float)`` mapping each recommendable item to its
        predicted score.  Items with no positively-similar rated item are
        left out.
    """
    if itemSimilarity is None:
        itemSimilarity = df_sim_items

    # Label-based row lookup (``.ix`` no longer exists in current pandas).
    userRow = dataItem.loc[user]
    rated = userRow[userRow.notnull()]          # items the user has scored
    unrated = userRow[userRow.isnull()].index   # candidates to recommend

    rankings = defaultdict(float)
    for candidate in unrated:
        weighted_total = 0.0
        sim_sum = 0.0
        for ratedItem in rated.index:
            sim = itemSimilarity[candidate][ratedItem]
            # Skip negative, zero and NaN similarities.
            if sim > 0:
                weighted_total += sim * rated[ratedItem]
                sim_sum += sim
        if sim_sum != 0:
            rankings[candidate] = weighted_total / sim_sum

    return rankings



        
← 推薦系統實作(二)-以用戶為基礎 層級分類法(Hierachical Clustering) →
 
comments powered by Disqus