第一个简陋的爬虫

2018-11-08  本文已影响0人  值得_e36c

想要爬取的网址:二手手机论坛 https://itbbs.pconline.com.cn/es/f240027.html

爬取内容:爬取论坛帖子标题中出现的手机品牌,并对各品牌的流行程度进行分析

该论坛界面展示:

image.png

代码实现:

import requests
from bs4 import BeautifulSoup
import jieba
from collections import Counter
import pygal
import chardet

def getHTMLText(url, timeout=10):
    """Download *url* and return the response body as text.

    The response encoding is overridden with the encoding detected from the
    body (``apparent_encoding``) before decoding, and that encoding is
    printed.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The decoded page text, or ``""`` when the request fails (connection
        error, timeout, invalid URL, or a non-2xx status code).
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Best-effort fetch: callers treat an empty string as "no content".
        return ""
    r.encoding = r.apparent_encoding
    print(r.encoding)
    return r.text

def getImportantText(soup):
    """Concatenate the text of every link matched in *soup*.

    Links are selected with ``li > div > span > a[target="_blank"]``
    (presumably the thread-title links of the forum listing; verify against
    the page markup).

    Args:
        soup: A parsed document exposing ``select(css_selector)``.

    Returns:
        The link texts joined together with no separator; ``""`` when
        nothing matches.
    """
    taglist = soup.select('li > div > span > a[target="_blank"]')
    parts = []
    for tag in taglist:
        title = tag.string
        if title is None:
            # ``.string`` is None when the tag has several children (e.g. a
            # nested <em>); fall back to the full text instead of crashing
            # on ``str + None``.
            title = tag.get_text()
        parts.append(title)
    return "".join(parts)

def manageFirst(url):
    """Fetch *url*, parse it with ``html.parser`` and return the link text.

    Combines ``getHTMLText`` (download) and ``getImportantText`` (extraction)
    for a single page.
    """
    page_soup = BeautifulSoup(getHTMLText(url), "html.parser")
    return getImportantText(page_soup)

def draw(text):
    """Count phone-brand mentions in *text* and render them as a pie chart.

    The text is segmented with jieba, tokens shorter than two characters are
    dropped, and the remaining tokens are matched case-insensitively against
    a fixed list of brand names. The number of kept tokens and each
    ``(brand, count)`` pair are printed, and the chart is written to
    ``pie.svg`` in the current working directory.

    Args:
        text: The raw text to analyse.
    """
    phoneList = ['苹果', '华为', '荣耀', '魅族', '三星', '小米', 'vivo', 'oppo']
    # Register every brand through the documented word-level API so jieba
    # keeps it as one token (load_userdict expects a file path or file object).
    for brand in phoneList:
        jieba.add_word(brand)
    words = [x for x in jieba.cut(text) if len(x) >= 2]
    print(len(words))

    # Count only brand tokens; lower-casing lets "OPPO"/"VIVO" match the
    # lower-case entries in phoneList. Counting brands directly also avoids
    # losing a brand behind an arbitrary "top N words" cutoff.
    brands = set(phoneList)
    brand_counts = Counter(
        token for token in (w.lower() for w in words) if token in brands
    )

    pie = pygal.Pie()
    # most_common() keeps the slices ordered from most to least mentioned.
    for brand, count in brand_counts.most_common():
        print((brand, count))
        pie.add(brand, count)
    pie.render_to_file("pie.svg")

def main():
    """Scrape the first two listing pages of the forum and chart the brands."""
    page_urls = (
        "https://itbbs.pconline.com.cn/es/f240027.html",
        "https://itbbs.pconline.com.cn/es/f240027_2.html",
    )
    # Pages are fetched in order and their text is concatenated directly.
    combined = "".join([manageFirst(page) for page in page_urls])
    draw(combined)

# Run the scraper only when executed as a script, so importing this module
# does not trigger network requests or write pie.svg.
if __name__ == "__main__":
    main()

爬取结果展示:

image.png
上一篇 下一篇

猜你喜欢

热点阅读