A Simple Crawler for Qiushibaike (糗事百科)

2017-11-17  咸鱼佬

A beginner's exercise.

Libraries used:


  1. BeautifulSoup
  2. urllib.request
  3. pymysql
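BeautifulSoup and pymysql come from PyPI (`pip install beautifulsoup4 PyMySQL`); urllib.request ships with Python 3's standard library.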

Database

MySQL is used for storage. The table schema:

CREATE TABLE `joke` (
    content VARCHAR(2048)
) DEFAULT CHARSET=utf8;  -- matches the charset the pymysql connection uses
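The database itself can also be created from Python. A minimal bootstrap sketch, assuming a local MySQL server with the same root/empty-password credentials the insert code below uses:

import pymysql

# Assumption: local MySQL, root with empty password, as in insert_db below.
connect = pymysql.connect(host='localhost', port=3306,
                          user='root', password='', charset='utf8')
try:
    with connect.cursor() as cursor:
        cursor.execute("CREATE DATABASE IF NOT EXISTS qsbk DEFAULT CHARSET utf8")
        cursor.execute("USE qsbk")
        cursor.execute("CREATE TABLE IF NOT EXISTS `joke` (content VARCHAR(2048))")
    connect.commit()
finally:
    connect.close()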

Connecting to the database

import pymysql


def insert_db(data):
    """Insert a list of joke strings into the `joke` table, one row each."""
    connect = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        password='',
        db='qsbk',
        charset='utf8')

    try:
        with connect.cursor() as cursor:
            sql = "INSERT INTO joke(content) VALUES (%s)"
            cursor.executemany(sql, data)
        connect.commit()
    finally:
        # Close the connection even if the insert fails.
        connect.close()
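As a quick smoke test, insert_db can be called directly with a list of strings; executemany binds each list element as one row (the sample strings below are hypothetical):

insert_db(['first test joke', 'second test joke'])

The single commit after executemany keeps all of a page's rows in one transaction, so a failed insert leaves no partial data behind.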

Fetching the data

from bs4 import BeautifulSoup
import threading
import urllib.error
import urllib.request

import Python01  # the module above, which provides insert_db()


# path = os.getcwd() + '/' + 'data.txt'


# An earlier version saved to a local file instead of MySQL:
# def save_data(text):
#     text = text + '\n\r'
#     f = open(r'C:\python\HelloWorld\data.txt', 'ab')
#     f.write(text.encode('utf-8'))
#     f.close()

class GetDataThread(threading.Thread):
    def __init__(self, page):
        threading.Thread.__init__(self)
        self.page = page

    def run(self):
        self.get_data()

    def get_data(self):
        url = 'https://www.qiushibaike.com/hot/page/' + str(self.page)

        # A browser User-Agent, since the site rejects the default one.
        headers = {
            'User-Agent': r'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3163.100 Safari/537.30'}

        request = urllib.request.Request(url, headers=headers)

        try:
            response = urllib.request.urlopen(request)
        except urllib.error.HTTPError:
            # Skip this page on an HTTP error; without this return,
            # the undefined `response` below would raise a NameError.
            print('failed to fetch page', self.page)
            return

        bs_obj = BeautifulSoup(response.read(), 'html.parser')

        # Each joke lives in a <div class="content"> element.
        store = bs_obj.find_all('div', {'class': 'content'})

        data_list = []

        for item in store:
            data = item.get_text().strip()
            # Keep only reasonably long jokes, and drop truncated ones
            # (those containing the "查看全文" / "view full text" link).
            if len(data) > 20 and '查看全文' not in data:
                print(data)
                data_list.append(data)

        Python01.insert_db(data_list)


# Start all threads first, then join them; starting and joining inside
# the same loop iteration would fetch the pages one at a time.
threads = [GetDataThread(i) for i in range(1, 8)]
for t in threads:
    t.start()
for t in threads:
    t.join()
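Seven simultaneous requests may trip the site's rate limiting. A minimal throttling sketch, reusing the GetDataThread class above and assuming an arbitrary one-second stagger between page requests:

import time

threads = []
for i in range(1, 8):
    t = GetDataThread(i)
    t.start()
    threads.append(t)
    time.sleep(1)  # assumption: a 1s stagger is polite enough
for t in threads:
    t.join()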