# Python 爬虫 - 爬取数据存储到MySQL (Python crawler: store scraped data in MySQL)
# 2019-01-07, 作者: 莫名ypc
import pymysql
# 获取数据库连接
def get_db_con():
    """Open and return a connection to the local `spider` MySQL database.

    NOTE(review): credentials are hard-coded; move them to config or
    environment variables before real use.
    """
    # Keyword arguments are required here: pymysql >= 1.0 dropped support
    # for positional connect() arguments, and the positional order used by
    # the original was error-prone in any case.
    con = pymysql.connect(
        host='127.0.0.1',
        port=3306,
        user='root',
        password='123456',
        database='spider',
        # MySQL's 'utf8' is a 3-byte subset; utf8mb4 covers full Unicode.
        charset='utf8mb4',
    )
    return con
# 获取游标
def get_cursor(con):
    """Return a fresh cursor obtained from the given connection."""
    cursor = con.cursor()
    return cursor
# 关闭连接
def close_connection(con):
    """Close the given database connection."""
    con.close()
# 执行插入语句
def insert_movie(one_movie_dict, con, cursor):
    """Insert one movie record into the maoyan_movie table and commit.

    Args:
        one_movie_dict: dict with 'title', 'actor' and 'release_time' keys.
        con: open database connection; committed after the insert.
        cursor: cursor obtained from `con`.
    """
    # Parameterized query: the original interpolated the values into the
    # SQL string with %-formatting, which breaks on quotes in the data and
    # is open to SQL injection. Passing params separately lets the driver
    # do the escaping (pymysql uses the %s paramstyle).
    sql = ("insert into maoyan_movie (title, actor, release_time) "
           "values (%s, %s, %s)")
    params = (
        one_movie_dict['title'],
        one_movie_dict['actor'],
        one_movie_dict['release_time'],
    )
    print(sql)
    cursor.execute(sql, params)
    con.commit()
def main():
    """Demo entry point: insert a single hard-coded movie row."""
    # NOTE(review): this file defines a second `main` further down which
    # shadows this one when the whole file is executed as a script.
    con = get_db_con()
    cursor = get_cursor(con)
    movie = {
        'title': '霸王别姬',
        'actor': '张国荣',
        'release_time': '2019-01-03',
    }
    try:
        insert_movie(movie, con, cursor)
    finally:
        # Always release the connection, even if the insert fails.
        close_connection(con)


if __name__ == '__main__':
    main()
import json
from time import sleep
import requests
from lxml import etree
def get_all_page(pages=306, delay=2):
    """Crawl the Douban group explore listing, one page per request.

    Args:
        pages: number of listing pages to fetch (the site paginates in
            steps of 30 items; default matches the original hard-coded 306).
        delay: seconds to sleep before each request, to throttle politely.
    """
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }
    base_url = 'https://www.douban.com/group/explore?start='
    for i in range(pages):
        sleep(delay)  # throttle so we do not hammer the server
        url = base_url + str(i * 30)  # each page offset advances by 30 items
        response = requests.get(url, headers=headers)
        # Only parse successful responses; other statuses are skipped silently.
        if response.status_code == 200:
            text = response.content.decode('utf-8')
            print(f'第{i + 1}页')
            parse_html(text)
def strips(l):
    """Return a new list with surrounding whitespace stripped from each item."""
    return [item.strip() for item in l]
# 保存json数据
def save_json(result_list):
    """Append the result list to douban.json as one JSON document.

    NOTE(review): successive calls append multiple JSON arrays to the same
    file, which is not a single valid JSON document — presumably intended
    as a quick dump; verify before parsing the file downstream.
    """
    serialized = json.dumps(result_list, ensure_ascii=False)
    with open('douban.json', 'a', encoding='utf-8') as out:
        out.write(serialized)
def parse_html(html):
    """Parse one Douban explore listing page into a list of item dicts.

    Args:
        html: full HTML text of the listing page.

    Returns:
        list of dicts with keys title / url / likes / come_from / pubtime /
        content / image; missing fields are the empty string.
    """
    result_list = []
    etree_html = etree.HTML(html)
    channel_result = etree_html.xpath('//div[@class="channel-item"]')

    def _first_or_empty(nodes):
        # xpath() returns a list; take the first hit or '' when absent.
        return nodes[0] if nodes else ''

    for channel in channel_result:
        # Bug fixes vs. the original:
        #  * title/content were indexed [0] at extraction and then indexed
        #    [0] *again* in the emptiness guard, taking only the first
        #    character of the string;
        #  * the empty-content branch assigned image = '' instead of
        #    content = '';
        #  * likes/come_from/pubtime were indexed [0] unguarded and raised
        #    IndexError when the node was missing.
        # Routing every field through _first_or_empty fixes all three.
        item = {
            'title': _first_or_empty(channel.xpath('./div[@class="bd"]/h3/a/text()')),
            'url': _first_or_empty(channel.xpath('./div[@class="bd"]/h3/a/@href')),
            'likes': _first_or_empty(channel.xpath('./div[@class="likes"]/text()')),
            'come_from': _first_or_empty(channel.xpath('./div[@class="bd"]/div[@class="source"]/span[@class="from"]/a/text()')),
            'pubtime': _first_or_empty(channel.xpath('./div[@class="bd"]/div[@class="source"]/span[@class="pubtime"]/text()')),
            'content': _first_or_empty(channel.xpath('./div[@class="bd"]/div[@class="block"]/p/text()')),
            'image': _first_or_empty(channel.xpath('./div[@class="bd"]/div[@class="block"]/div[@class="pic"]/div[@class="pic-wrap"]/img/@src')),
        }
        result_list.append(item)
    # save_json(result_list)  # enable to persist each parsed page
    return result_list
def main():
    """Script entry point: crawl every listing page."""
    get_all_page()


if __name__ == '__main__':
    main()