
JazzRadio is an online jazz-listening service. Suppose we have the following data on our own company's employees: years of service, annual salary, and whether they have paid to subscribe to the company's product (the full dataset appears in the code below).


Given this data, how do we estimate the probability that an employee with a given tenure and salary will pay for a subscription?
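
The logistic regression model (see Logistic Regression(一)數學基礎) maps a linear combination of the two features to a probability through the sigmoid function:
$$
P(\text{paid}\mid x_1, x_2) = \frac{1}{1+e^{-(\theta_0+\theta_1 x_1+\theta_2 x_2)}}
$$
where $x_1$ is years of service and $x_2$ is annual salary.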


Techniques

  1. Years of service and salary live on very different scales, so we first standardize the data (standardization: mean 0, standard deviation 1), the statistical analogue of nondimensionalization in physics (a quick numerical check follows this list):
    $$
    x = \frac{x_{\text{orig}}-\bar x}{\sigma_x}
    $$
    In practice this is done with sklearn.preprocessing.scale(data).

  2. The decision boundary (the line separating paid from unpaid) occurs where
    $$
    \theta_0 + \theta_1 x_1 + \theta_2 x_2 = 0
    $$
    On the $(x_1, x_2)$ plane this is a straight line with slope $-\theta_1/\theta_2$ and intercept $-\theta_0/\theta_2$.
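
A quick numerical check (a minimal sketch, not part of the original analysis) that sklearn.preprocessing.scale matches the standardization formula in item 1, using the first three rows of the data:

import numpy as np
from sklearn import preprocessing

X = np.array([[0.7, 48000.0],
              [1.9, 48000.0],
              [2.5, 60000.0]]) # the first three (tenure, salary) rows

# standardize by hand: subtract each column's mean, divide by its std
by_hand = (X - X.mean(axis=0)) / X.std(axis=0)

assert np.allclose(by_hand, preprocessing.scale(X))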


Results

  1. Accuracy on the data: 89.2%

  2. Logistic regression predictions vs. the actual outcomes (the figure produced by plotPredict below)

  3. The decision boundary (the figure produced by plotBoundary below)

P.S. Because logistic regression draws a linear decision boundary, it classifies well only when the data are at least roughly linearly separable. If they are not, transform the variables so that the problem becomes linearly separable in the new feature space; but such transformations invite overfitting, so be careful!
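
As an illustration of such a variable transformation (a sketch on synthetic data, not part of the original analysis): points inside a circle are not linearly separable in $(x_1, x_2)$, but adding degree-2 polynomial features makes the problem linearly separable in the new feature space:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures

rng = np.random.RandomState(0)
X = rng.uniform(-1, 1, size=(200, 2))
y = (X[:, 0]**2 + X[:, 1]**2 < 0.5).astype(int) # circular class: not linearly separable

plain = LogisticRegression().fit(X, y)
poly = PolynomialFeatures(degree=2) # adds x1^2, x1*x2, x2^2, ...
X_poly = poly.fit_transform(X)
curved = LogisticRegression().fit(X_poly, y)

# the transformed model separates the classes far better than the plain one
print(plain.score(X, y), curved.score(X_poly, y))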


Code

## logistic regression


from sklearn import linear_model, preprocessing
from logistic_reg import LogisticRegression
import numpy as np
import matplotlib.pyplot as plt

data = [(0.7,48000,1),(1.9,48000,0),(2.5,60000,1),(4.2,63000,0),(6,76000,0),(6.5,69000,0),(7.5,76000,0),(8.1,88000,0),(8.7,83000,1),(10,83000,1),(0.8,43000,0),(1.8,60000,0),(10,79000,1),(6.1,76000,0),(1.4,50000,0),(9.1,92000,0),(5.8,75000,0),(5.2,69000,0),(1,56000,0),(6,67000,0),(4.9,74000,0),(6.4,63000,1),(6.2,82000,0),(3.3,58000,0),(9.3,90000,1),(5.5,57000,1),(9.1,102000,0),(2.4,54000,0),(8.2,65000,1),(5.3,82000,0),(9.8,107000,0),(1.8,64000,0),(0.6,46000,1),(0.8,48000,0),(8.6,84000,1),(0.6,45000,0),(0.5,30000,1),(7.3,89000,0),(2.5,48000,1),(5.6,76000,0),(7.4,77000,0),(2.7,56000,0),(0.7,48000,0),(1.2,42000,0),(0.2,32000,1),(4.7,56000,1),(2.8,44000,1),(7.6,78000,0),(1.1,63000,0),(8,79000,1),(2.7,56000,0),(6,52000,1),(4.6,56000,0),(2.5,51000,0),(5.7,71000,0),(2.9,65000,0),(1.1,33000,1),(3,62000,0),(4,71000,0),(2.4,61000,0),(7.5,75000,0),(9.7,81000,1),(3.2,62000,0),(7.9,88000,0),(4.7,44000,1),(2.5,55000,0),(1.6,41000,0),(6.7,64000,1),(6.9,66000,1),(7.9,78000,1),(8.1,102000,0),(5.3,48000,1),(8.5,66000,1),(0.2,56000,0),(6,69000,0),(7.5,77000,0),(8,86000,0),(4.4,68000,0),(4.9,75000,0),(1.5,60000,0),(2.2,50000,0),(3.4,49000,1),(4.2,70000,0),(7.7,98000,0),(8.2,85000,0),(5.4,88000,0),(0.1,46000,0),(1.5,37000,0),(6.3,86000,0),(3.7,57000,0),(8.4,85000,0),(2,42000,0),(5.8,69000,1),(2.7,64000,0),(3.1,63000,0),(1.9,48000,0),(10,72000,1),(0.2,45000,0),(8.6,95000,0),(1.5,64000,0),(9.8,95000,0),(5.3,65000,0),(7.5,80000,0),(9.9,91000,0),(9.7,50000,1),(2.8,68000,0),(3.6,58000,0),(3.9,74000,0),(4.4,76000,0),(2.5,49000,0),(7.2,81000,0),(5.2,60000,1),(2.4,62000,0),(8.9,94000,0),(2.4,63000,0),(6.8,69000,1),(6.5,77000,0),(7,86000,0),(9.4,94000,0),(7.8,72000,1),(0.2,53000,0),(10,97000,0),(5.5,65000,0),(7.7,71000,1),(8.1,66000,1),(9.8,91000,0),(8,84000,0),(2.7,55000,0),(2.8,62000,0),(9.4,79000,0),(2.5,57000,0),(7.4,70000,1),(2.1,47000,0),(5.3,62000,1),(6.3,79000,0),(6.8,58000,1),(5.7,80000,0),(2.2,61000,0),(4.8,62000,0),(3.7,64000,0),(4.1,85000,0),(2.3,51000,0),(3.5,58000,0),(0.9,43000,0),(0.9,54000,0),(4.5,74000,0),(6.5,55000,1),(4.1,41000,1),(7.1,73000,0),(1.1,66000,0),(9.1,81000,1),(8,69000,1),(7.3,72000,1),(3.3,50000,0),(3.9,58000,0),(2.6,49000,0),(1.6,78000,0),(0.7,56000,0),(2.1,36000,1),(7.5,90000,0),(4.8,59000,1),(8.9,95000,0),(6.2,72000,0),(6.3,63000,0),(9.1,100000,0),(7.3,61000,1),(5.6,74000,0),(0.5,66000,0),(1.1,59000,0),(5.1,61000,0),(6.2,70000,0),(6.6,56000,1),(6.3,76000,0),(6.5,78000,0),(5.1,59000,0),(9.5,74000,1),(4.5,64000,0),(2,54000,0),(1,52000,0),(4,69000,0),(6.5,76000,0),(3,60000,0),(4.5,63000,0),(7.8,70000,0),(3.9,60000,1),(0.8,51000,0),(4.2,78000,0),(1.1,54000,0),(6.2,60000,0),(2.9,59000,0),(2.1,52000,0),(8.2,87000,0),(4.8,73000,0),(2.2,42000,1),(9.1,98000,0),(6.5,84000,0),(6.9,73000,0),(5.1,72000,0),(9.1,69000,1),(9.8,79000,1)]

x = [list(row[0:2]) for row in data]
y = [row[2] for row in data]

# rescale the data

rescaled_x = preprocessing.scale(x).tolist() # LogisticRegression expects a plain list

data_cleaned = list(zip(rescaled_x, y)) # materialize so it can be iterated more than once

lg = LogisticRegression(data_cleaned)
logistic = linear_model.LogisticRegression()
logistic.fit(rescaled_x,y)

## stochastic gradient descent for logistic regression


mini_batch_size = 1 # batch size 1 = plain stochastic gradient descent

iter_no = 10**3 # run at most iter_no iterations

tol = 10**-6 # stop once theta changes by less than tol

alpha = 10**-1 # learning rate



lg.sgd(mini_batch_size,iter_no,tol,alpha)
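
# For reference, one SGD step for logistic regression looks roughly like the
# following (a sketch of the standard update rule; the actual internals of
# logistic_reg may differ):
#
#   h = 1 / (1 + np.exp(-np.dot(x_i, theta))) # predicted probability for one sample
#   theta = theta - alpha * (h - y_i) * x_i   # gradient step on that sample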

## plot logistic regression vs actual


print "correctness of our predictions/sklearn is :{},{}".format(lg.goodness(),logistic.score(rescaled_x,y))



def plotPredict():
    ''' Logistic Regression Predicted vs Actual'''
    predictions = lg.predict(lg.x)
    xblue_point = []
    yblue_point = []
    xred_point = []
    yred_point= []
    for i, e in enumerate(predictions):
        # a point is misclassified when the predicted probability and the
        # actual outcome fall on opposite sides of 0.5
        if (e < 0.5 and y[i] == 1) or (e >= 0.5 and y[i] == 0):
            # 'red' point : error
            xred_point.append(e)
            yred_point.append(y[i])
        else:
            # 'blue' point : correct
            xblue_point.append(e)
            yblue_point.append(y[i])

    plt.scatter(xblue_point,yblue_point,color='blue',label = 'correct')
    plt.scatter(xred_point,yred_point,color ='red',label='error')
    plt.legend()
    plt.xlabel('predicted probability')
    plt.ylabel('actual outcome')
    plt.title('Logistic Regression Predicted vs Actual')


def plotBoundary():
    ''' Logistic Regression Decision Boundary'''

    x1r,x2r = zip(*rescaled_x)
    x1,x2 = zip(*x)
    x1mean = np.mean(x1)
    x2mean = np.mean(x2)
    x1std  = np.std(x1)
    x2std  = np.std(x2)

    paid = [xdata for xdata,pay in data_cleaned if pay==1]
    unpaid = [xdata for xdata,pay in data_cleaned if pay!=1]

    paid_exp,paid_salary = zip(*paid)
    unpaid_exp,unpaid_salary = zip(*unpaid)

    paid_exp =np.array(paid_exp)*x1std + x1mean
    unpaid_exp = np.array(unpaid_exp)*x1std + x1mean

    paid_salary = np.array(paid_salary)*x2std + x2mean
    unpaid_salary = np.array(unpaid_salary)*x2std + x2mean

    plt.scatter(unpaid_exp,unpaid_salary,marker='+',label ='unpaid')
    plt.scatter(paid_exp,paid_salary,label ='paid')

    ## boundary occurs at \theta_0 + \theta_1 x1 + \theta_2 x2 =0

    [theta0],[theta1],[theta2] = lg.theta
    x1r_arr = np.array(x1r)
    x2boundary = -(theta1 / theta2) * x1r_arr - theta0 / theta2

    # map the boundary back to the original (unstandardized) units
    x1boundary = x1std * x1r_arr + x1mean
    x2boundary = x2std * x2boundary + x2mean

    plt.plot(x1boundary, x2boundary, c='red', label='boundary')

    plt.legend()
    plt.xlabel('years experience')
    plt.ylabel('salary')
    plt.title('Logistic Regression Decision Boundary')
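
## draw the figures

# usage note: show one figure at a time (assumes an interactive matplotlib backend)
plotPredict()
plt.show()
plotBoundary()
plt.show()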