over 2 years ago


預測的方式很簡易,從電腦書籍網站O'REILLY觀察資料類的電腦書籍出版數量,隨年份的變化趨勢,說明此領域是否逐年受到人們重視.

複習技能

  • soup後的資料取出
# Fetch the listing page and parse the returned HTML.
res = requests.get(url)
soup = BeautifulSoup(res.text)
# Calling the soup object directly is BeautifulSoup shorthand for find_all():
# this collects every <td> whose CSS class is "thumbtext".
tds = soup('td','thumbtext')

# Same shorthand on a single cell: all <span class="pricelabel"> inside it.
pricelabels = td('span','pricelabel')  # td is one element of tds

# The link inside <div class="thumbheader"> supplies the title (its text)
# and the product URL (its href attribute).
title = td.find("div","thumbheader").a.text.strip()
isbn_link = td.find("div","thumbheader").a.get("href")
  • 過濾video資料

    def is_video(td):
        """Tell whether a listing cell describes a video rather than a book.

        Args:
            td: a callable tag-like object; ``td('span', 'pricelabel')`` must
                return the cell's price-label elements, each with a ``text``
                attribute.

        Returns:
            True when the first price label starts with the word "Video:",
            False otherwise (including when no price label is shown).
        """
        pricelabels = td('span', 'pricelabel')
        if not pricelabels:
            # No price list shown: treat the entry as a regular book.
            return False
        words = pricelabels[0].text.strip().split()
        return bool(words) and words[0] == u'Video:'
    
  • 利用正規表達式(Regular expression)

    # Raw string with an escaped dot, so only a literal ".do" suffix matches.
    isbn = re.match(r"/product/(.*)\.do", isbn_link).group(1)
    
  • 儲存單一變數在pickle檔中,並從檔案讀回


    # Pickle data is a byte stream, so the file is opened in binary mode.
    with open("oreilly-data.pickle", 'rb') as f:
        books = pickle.load(f)


    完整程式碼

    ## This script crawls the O'Reilly bookstore to measure the
    
    ## growth of data-science related books.
    
    from bs4 import BeautifulSoup
    from collections import Counter
    import requests
    import html5lib
    import re
    import pickle
    import matplotlib.pyplot as plt
    
    def is_video(td):
        """Tell whether a listing cell describes a video rather than a book.

        Args:
            td: a callable tag-like object; ``td('span', 'pricelabel')`` must
                return the cell's price-label elements, each with a ``text``
                attribute.

        Returns:
            True when the first price label starts with the word "Video:",
            False otherwise (including when no price label is shown).
        """
        pricelabels = td('span', 'pricelabel')
        if not pricelabels:
            # No price list shown: treat the entry as a regular book.
            return False
        words = pricelabels[0].text.strip().split()
        return bool(words) and words[0] == u'Video:'
    
    def book_info(td):
        """Extract title, ISBN, publication date and authors from one listing cell.

        Args:
            td: a tag-like object whose ``find(name, css_class)`` returns the
                "thumbheader" div, the "directorydate" span and the
                "AuthorName" div of the cell.

        Returns:
            dict with the keys "title", "isbn" and "date" (strings) and
            "authors" (list of strings, one per comma-separated name).

        Raises:
            AttributeError: if an expected element is missing, or the header
                link does not start with ``/product/<isbn>.do``.
        """
        header_link = td.find("div", "thumbheader").a
        title = header_link.text.strip()
        isbn_link = header_link.get("href")
        # Raw string with an escaped dot, so only a literal ".do" suffix matches.
        isbn = re.match(r"/product/(.*)\.do", isbn_link).group(1)
        date = td.find("span", "directorydate").text.strip()
        by_author = td.find("div", "AuthorName").text.strip()
        # Drop the leading "By", then trim every name: splitting on ','
        # otherwise leaves a leading space on all names after the first.
        authors = [name.strip()
                   for name in re.sub(r"^By", " ", by_author).split(',')]

        return {
            "title": title,
            "isbn": isbn,
            "date": date,
            "authors": authors,
        }
    
    # Page count for the crawl; presumably meant to be passed to scrape() --
    # no call in the visible script uses this module-level value.
    page_num =41
    
    ## Main crawler: visits the O'Reilly listing pages and gathers the info of all data-related books.
    
    def scrape(page_num):
        """Crawl the O'Reilly "data" category and collect every non-video entry.

        Pages 1..page_num of the listing (sorted by publication date) are
        requested one after another. The collected list is also pickled to
        "oreilly-data.pickle" in the current working directory.

        Args:
            page_num: number of listing pages to fetch.

        Returns:
            list of the dicts produced by book_info(), in crawl order.
        """
        url_template = ("http://shop.oreilly.com/category/browse-subjects/"
                        "data.do?sortby=publicationDate&page=%s")
        books = []
        for page in range(1, page_num + 1):
            res = requests.get(url_template % page)
            # NOTE(review): no parser is named, so bs4 picks whichever one is
            # installed; consider passing one explicitly for reproducibility.
            soup = BeautifulSoup(res.text)
            tds = soup('td', 'thumbtext')

            books.extend(book_info(td) for td in tds if not is_video(td))

            # A single-argument print() call behaves the same on Python 2 and 3.
            print("this is page:{}/{} soup, {} books found so far".format(
                page, page_num, len(books)))

        # Pickle output is a byte stream, so the file is opened in binary mode.
        with open("oreilly-data.pickle", 'wb') as f:
            pickle.dump(books, f)

        return books
    
    def get_book_year_count():
        """Count the crawled books per publication year.

        Reads the list of book dicts stored in "oreilly-data.pickle" (the file
        written by scrape()) and tallies the year found in each "date" value.

        Returns:
            collections.Counter mapping the year string to the number of books.
        """
        # NOTE: only unpickle files you trust; this one is produced locally.
        # Pickle data is a byte stream, so the file is opened in binary mode.
        with open("oreilly-data.pickle", 'rb') as f:
            books = pickle.load(f)

        pub_years = []
        for book in books:
            tokens = book['date'].split()
            # The year is the last token ("March 2014" -> "2014"); taking it
            # from the end also handles a bare "2014". Blank dates are skipped.
            if tokens:
                pub_years.append(tokens[-1])

        return Counter(pub_years)
    
    ## Bar chart: number of books versus publication year.


    bookcounts = get_book_year_count()

    # Years in ascending order, with the matching count for each of them.
    years = list(bookcounts.keys())
    years.sort()
    booksNumbers = []
    for year in years:
        booksNumbers.append(bookcounts[year])

    plt.bar(years, booksNumbers)
    plt.ylabel('numbers of data-related books')
    plt.xlabel('publish year')
    plt.show()
    
← 抽象繪圖-KMeans演算法 字數計算(words counts) →
 
comments powered by Disqus