
Suppose a fictional company, 1111 Tech, wants to analyze the relationship between job applicants and whether they get hired. We have historical interview data whose features are: phd, tweets, level, and lang. Given this data, how do we predict whether the next candidate will be hired? And, going a step further, what kind of candidates should we invite to interview?

The data structure looks like this:

[
        ({'level':'Senior','lang':'Java','tweets':'no','phd':'no'},   False),
        ...       
       ]

Approach

  1. Find the feature (best_feature) whose partition gives the subgroups with the lowest entropy (see the formulas just below).
  2. Return the structure (best_feature, subtrees).
  3. Build the subtrees recursively, working downward. The recursion stops when a subset's labels are all True or all False, or when the feature set is empty.
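
The two quantities computed in the code below are the standard ones: the entropy of a subset S with class probabilities p_i, and the weighted entropy after partitioning S on a feature F into subsets S_v:

    H(S) = -\sum_i p_i \log_2 p_i

    H(S \mid F) = \sum_{v} \frac{|S_v|}{|S|} \, H(S_v)

best_feature is the F that minimizes H(S | F).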

Following this method, we can build a decision tree like the one below.
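
With the 14 training examples listed in the code section, these steps should produce a tree of nested (best_feature, subtrees) tuples, with level at the root:

    ('level', {'Senior': ('tweets', {'no': False, 'yes': True}),
               'Mid': True,
               'Junior': ('phd', {'no': True, 'yes': False})})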


Code

## my own decision tree


from collections import Counter, defaultdict
from functools import partial
import numpy as np


inputs = [
        ({'level':'Senior','lang':'Java','tweets':'no','phd':'no'},   False),
        ({'level':'Senior','lang':'Java','tweets':'no','phd':'yes'},  False),
        ({'level':'Mid','lang':'Python','tweets':'no','phd':'no'},     True),
        ({'level':'Junior','lang':'Python','tweets':'no','phd':'no'},  True),
        ({'level':'Junior','lang':'R','tweets':'yes','phd':'no'},      True),
        ({'level':'Junior','lang':'R','tweets':'yes','phd':'yes'},    False),
        ({'level':'Mid','lang':'R','tweets':'yes','phd':'yes'},        True),
        ({'level':'Senior','lang':'Python','tweets':'no','phd':'no'}, False),
        ({'level':'Senior','lang':'R','tweets':'yes','phd':'no'},      True),
        ({'level':'Junior','lang':'Python','tweets':'yes','phd':'no'}, True),
        ({'level':'Senior','lang':'Python','tweets':'yes','phd':'yes'},True),
        ({'level':'Mid','lang':'Python','tweets':'no','phd':'yes'},    True),
        ({'level':'Mid','lang':'Java','tweets':'yes','phd':'no'},      True),
        ({'level':'Junior','lang':'Python','tweets':'no','phd':'yes'},False)
    ]

def entropy(class_prob):
    '''Entropy of a node, given the list of class probabilities in that node.'''
    return sum(-pi * np.log2(pi) for pi in class_prob)

def data_entropy(labels):
    '''Entropy of a list of labels, e.g. the True/False hiring outcomes.'''
    total_count = len(labels)
    class_prob = [count / total_count for count in Counter(labels).values()]
    return entropy(class_prob)
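
# a quick sanity check, assuming the definitions above:
# data_entropy([True, True, False]) -> ~0.918 bits
# data_entropy([True, True])        -> 0.0 (a pure node)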

def partition_by(inputs,feature):
    '''
    Group the (attributes, label) pairs by the value of the given feature;
    returns a defaultdict mapping feature value -> list of pairs.
    '''
    groups = defaultdict(list)
    for e in inputs:
        key = e[0][feature]
        groups[key].append(e)

    return groups
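
# for example, partition_by(inputs, 'level') should return groups keyed
# 'Senior', 'Mid', 'Junior', holding 5, 4, and 5 of the 14 examples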

def entropy_by_partition(inputs, feature):
    '''
    Weighted entropy of the subsets obtained by partitioning
    the inputs on the given feature.
    '''
    groups = partition_by(inputs, feature)
    total_count = len(inputs)

    entropy_total = 0
    for subset in groups.values():
        labels_in_subset = [label for _, label in subset]
        partition_probability = len(labels_in_subset) / total_count
        entropy_total += partition_probability * data_entropy(labels_in_subset)

    return entropy_total


for e in ['lang', 'level', 'tweets', 'phd']:
    print(e, entropy_by_partition(inputs, e))
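
# expected output, computed from the 14 examples above ('level' is
# lowest, so it becomes the root split):
# lang   ~0.860
# level  ~0.694
# tweets ~0.788
# phd    ~0.892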


def build_tree(inputs, split_features=None):

    # on the first call, every feature is a split candidate
    if split_features is None:
        split_features = list(inputs[0][0].keys())

    # count Trues and Falses in the inputs
    num_inputs = len(inputs)
    num_trues = len([label for item, label in inputs if label])
    num_falses = num_inputs - num_trues

    if num_trues == 0:                  # if only Falses are left
        return False                    # return a "False" leaf

    if num_falses == 0:                 # if only Trues are left
        return True                     # return a "True" leaf

    if not split_features:              # if no split candidates are left
        return num_trues >= num_falses  # return the majority leaf

    # find the feature whose partition has the lowest weighted entropy
    best_feature = min(split_features, key=partial(entropy_by_partition, inputs))

    # partition on the best feature
    groups = partition_by(inputs, best_feature)

    # the best feature is used up; the rest remain candidates
    new_candidates = [e for e in split_features if e != best_feature]

    # recursively build a subtree for each value of the best feature
    subtrees = {key: build_tree(subset, new_candidates)
                for key, subset in groups.items()}

    return (best_feature, subtrees)
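
Building the tree answers only half of the opening question; to predict whether the next candidate will be hired, we still have to walk the finished tree. The classify helper below is my own addition, not part of the original listing: a minimal sketch that assumes every feature value seen at prediction time also appeared during training, since no default branch is stored in the tree.

tree = build_tree(inputs)
print(tree)

def classify(tree, candidate):
    '''Predict True/False for a candidate dict by walking the tree.
    (Addition to the original code; assumes the candidate's feature
    values all appeared in the training data.)'''
    if isinstance(tree, bool):                 # leaf: the prediction itself
        return tree
    feature, subtrees = tree                   # internal node
    subtree = subtrees[candidate[feature]]     # follow the matching branch
    return classify(subtree, candidate)

# e.g. a Junior Python coder who tweets but has no PhD:
print(classify(tree, {'level': 'Junior', 'lang': 'Python',
                      'tweets': 'yes', 'phd': 'no'}))    # expected: True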