Python Web Scraping - Saving Scraped Data to MySQL

2019-01-07  莫名ypc
import pymysql


# Get a database connection
def get_db_con():
    host = '127.0.0.1'
    port = 3306
    user = 'root'
    password = '123456'
    database = 'spider'
    # PyMySQL 1.0+ requires keyword arguments; charset 'utf8mb4' would
    # additionally cover 4-byte characters such as emoji
    con = pymysql.connect(host=host, port=port, user=user,
                          password=password, database=database,
                          charset='utf8')
    return con


# Get a cursor
def get_cursor(con):
    return con.cursor()


# Close the connection
def close_connection(con):
    con.close()


# Execute the INSERT statement
def insert_movie(one_movie_dict, con, cursor):
    # Pass the values as parameters instead of formatting them into the
    # string: pymysql escapes them, which avoids SQL injection and broken
    # quoting when a title contains quote characters
    sql = "insert into maoyan_movie (title, actor, release_time) values (%s, %s, %s)"
    params = (one_movie_dict['title'],
              one_movie_dict['actor'],
              one_movie_dict['release_time'])
    print(sql, params)
    cursor.execute(sql, params)
    con.commit()


def main():
    con = get_db_con()
    cursor = get_cursor(con)
    m_dict = {
        'title': '霸王别姬',
        'actor': '张国荣',
        'release_time': '2019-01-03',
    }
    try:
        insert_movie(m_dict, con, cursor)
    finally:
        close_connection(con)


if __name__ == '__main__':
    main()
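
For reference, here is a minimal sketch of the maoyan_movie table that the insert above assumes. The column names come straight from the INSERT statement; the id primary key, the types, and the lengths are assumptions:

import pymysql

# Hypothetical DDL: only the column names are taken from the original post
CREATE_TABLE_SQL = """
create table if not exists maoyan_movie (
    id int auto_increment primary key,
    title varchar(255),
    actor varchar(255),
    release_time varchar(32)
) default charset = utf8
"""

con = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                      password='123456', database='spider', charset='utf8')
try:
    with con.cursor() as cursor:
        cursor.execute(CREATE_TABLE_SQL)
    con.commit()
finally:
    con.close()

The second script below crawls Douban's group explore listing page by page with requests, parses each entry with lxml XPath, and collects the results.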

import json
from time import sleep

import requests
from lxml import etree


def get_all_page():
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }
    for i in range(306):
        sleep(2)  # throttle requests to avoid getting blocked
        page = i * 30  # the listing is paginated in steps of 30 items
        url = f'https://www.douban.com/group/explore?start={page}'
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            text = response.content.decode('utf-8')
            print(f'Page {i + 1}')
            parse_html(text)


# Strip surrounding whitespace from every string in a list
def strips(l):
    return [item.strip() for item in l]


# Save the results as JSON; the file is opened in append mode, so each
# call adds one JSON array on its own line (one line per parsed page)
def save_json(result_list):
    result_json_str = json.dumps(result_list, ensure_ascii=False)
    with open('douban.json', 'a', encoding='utf-8') as f:
        f.write(result_json_str + '\n')
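
Because of the append mode, douban.json ends up holding one JSON array per line rather than a single JSON document, so it has to be read back line by line. A minimal sketch under that assumption (load_json is a hypothetical helper, not part of the original post):

def load_json(path='douban.json'):
    items = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                # each non-empty line is one JSON array of items
                items.extend(json.loads(line))
    return items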


def parse_html(html):
    result_list = []
    etree_html = etree.HTML(html)
    channel_result = etree_html.xpath('//div[@class="channel-item"]')
    for channel in channel_result:
        item = {}
        # xpath() always returns a list; keep the lists here and pick the
        # first match (or '') below, so a missing node cannot raise IndexError
        title = channel.xpath('./div[@class="bd"]/h3/a/text()')
        url = channel.xpath('./div[@class="bd"]/h3/a/@href')
        likes = channel.xpath('./div[@class="likes"]/text()')
        come_from = channel.xpath('./div[@class="bd"]/div[@class="source"]/span[@class="from"]/a/text()')
        pubtime = channel.xpath('./div[@class="bd"]/div[@class="source"]/span[@class="pubtime"]/text()')
        content = channel.xpath('./div[@class="bd"]/div[@class="block"]/p/text()')
        image = channel.xpath('./div[@class="bd"]/div[@class="block"]/div[@class="pic"]/div[@class="pic-wrap"]/img/@src')
        item['title'] = title[0].strip() if title else ''
        item['url'] = url[0] if url else ''
        item['likes'] = likes[0].strip() if likes else ''
        item['come_from'] = come_from[0] if come_from else ''
        item['pubtime'] = pubtime[0].strip() if pubtime else ''
        item['content'] = content[0].strip() if content else ''
        item['image'] = image[0] if image else ''

        # insert into the database here (see the sketch after this function)

        result_list.append(item)
    # save_json(result_list)
    return result_list
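
The placeholder comment inside parse_html marks where each item could be written to MySQL. Here is a minimal sketch reusing the pymysql helpers from the first script; the douban_channel table and its columns are hypothetical, chosen to mirror the keys of the item dict, and are not defined in the original post:

# Hypothetical: assumes a douban_channel table whose columns mirror the
# item dict built in parse_html
def insert_channel_item(item, con, cursor):
    sql = ("insert into douban_channel "
           "(title, url, likes, come_from, pubtime, content, image) "
           "values (%s, %s, %s, %s, %s, %s, %s)")
    params = (item['title'], item['url'], item['likes'], item['come_from'],
              item['pubtime'], item['content'], item['image'])
    # parameterized execute, same pattern as insert_movie above
    cursor.execute(sql, params)
    con.commit()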


def main():
    get_all_page()


if __name__ == '__main__':
    main()