使用PyQuery爬取猫眼电影 及PyQuery使用方法-实战篇
2019-06-25 本文已影响0人
Serven_Students
讲解:
- 使用Python中的PyQuery库爬取猫眼电影,并存入数据库和txt文档
涉及Python相关库:
from urllib.robotparser import RobotFileParser
import requests
from pyquery import PyQuery
import pymysql
下面展示一段神奇的代码:
from urllib.robotparser import RobotFileParser
import requests
from pyquery import PyQuery
import pymysql
# First value of the ``offset`` query parameter that main() appends to the board URL.
offert = 0
# Exclusive upper bound of the offset range walked by main() (in steps of 10).
offert_tail = 100
def get_one_page(url, timeout=10):
    """Download one board page and hand its HTML to ``one_page``.

    The page is only requested when the site's ``robots.txt`` allows the
    generic ``*`` user agent to fetch ``url``; nothing happens otherwise,
    or when the response status is not 200.

    Args:
        url: Absolute URL of the page to download.
        timeout: Seconds to wait for the HTTP response before giving up.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urljoin

    robots = RobotFileParser()
    # RobotFileParser must be pointed at the robots.txt file itself, not at
    # the page being crawled; derive it from the page URL's scheme and host.
    robots.set_url(urljoin(url, "/robots.txt"))
    robots.read()
    if not robots.can_fetch("*", url):
        return

    # Without a timeout, requests may block forever on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        one_page(response.text)
def one_page(html):
    """Parse a board page and persist every ``<dd>`` entry found in it.

    For each ``<dd>`` element the text of its direct ``<i>`` children and of
    the ``.name``, ``.star``, ``.releasetime`` and ``.score`` descendants is
    collected into a dict, which is appended to the text file via ``write_r``
    and stored in the database via ``insert_mysql``.

    Args:
        html: HTML source of the page.
    """
    document = PyQuery(html)
    for entry in document('dd').items():
        # Note: the '.star' text is stored under the key 'start', matching
        # the column name used by the database table.
        record = {
            'indexi': entry.children('i').text(),
            'namei': entry.find('.name').text(),
            'start': entry.find('.star').text(),
            'releasetime': entry.find('.releasetime').text(),
            'score': entry.find('.score').text(),
        }
        write_r(str(record))
        insert_mysql(record)
def main():
    """Fetch every board page for offsets ``offert`` up to ``offert_tail``.

    Offsets advance in steps of 10 and are appended to the board URL's
    ``offset`` query parameter before the page is passed to ``get_one_page``.
    """
    base = "https://maoyan.com/board/4?offset="
    offsets = range(offert, offert_tail, 10)
    for offset in offsets:
        page_url = base + str(offset)
        get_one_page(page_url)
def write_r(neirong):
    """Append ``neirong`` followed by a newline to ``./text.txt`` (UTF-8).

    Args:
        neirong: Text of the line to append.
    """
    with open('./text.txt', 'a', encoding='utf-8') as out_file:
        line = neirong + '\n'
        out_file.write(line)
def mysql():
    """Make sure the ``ceshi`` table exists and is empty.

    Connects to the local MySQL server, tries to create the table, and then
    truncates it so every run starts from an empty table. The connection is
    always closed, even when a statement fails.

    Raises:
        pymysql.MySQLError: If creating the table fails for any reason other
            than the table already existing, or if truncating fails.
    """
    # Connect to the database.
    conn = pymysql.connect(host='127.0.0.1', port=3306, user='ceshi', passwd='123456',
                           db='django_one', charset='utf8')
    # The table name is lower case so it matches the ``truncate`` below and
    # the ``insert into ceshi`` statement used elsewhere; MySQL table names
    # are case-sensitive on case-sensitive file systems.
    sql_table = """CREATE TABLE ceshi (
    id INT PRIMARY KEY AUTO_INCREMENT,
    indexi INT (12),
    namei VARCHAR (225),
    start VARCHAR (255),
    releasetime VARCHAR (225),
    score VARCHAR (225),
    time TIMESTAMP DEFAULT now())"""
    try:
        # Create a cursor object; the context manager closes it afterwards.
        with conn.cursor() as cursor:
            try:
                cursor.execute(sql_table)
            except pymysql.MySQLError as exc:
                # 1050 is MySQL's "table already exists" error; anything else
                # is a real failure and must not be reported as "already
                # created".
                if not exc.args or exc.args[0] != 1050:
                    raise
                print("数据库已创建")
            # Remove all existing rows from the table.
            cursor.execute('truncate table ceshi')
    finally:
        conn.close()
def insert_mysql(value):
    """Insert one scraped record into the ``ceshi`` table.

    Failures are reported on stdout rather than raised, so one bad row does
    not stop the crawl. The connection is closed in every case.

    Args:
        value: Mapping with the keys ``indexi``, ``namei``, ``start``,
            ``releasetime`` and ``score``.

    Returns:
        The ``value`` argument, unchanged.

    Raises:
        KeyError: If ``value`` lacks one of the required keys.
    """
    # Build the parameter tuple before connecting so a malformed record
    # fails without opening (and leaking) a connection.
    row = (value['indexi'], value['namei'], value['start'],
           value['releasetime'], value['score'])
    # Connect to the database.
    conn = pymysql.connect(host='127.0.0.1', port=3306, user='ceshi', passwd='123456',
                           db='django_one', charset='utf8')
    try:
        # Create a cursor object; the context manager closes it afterwards.
        with conn.cursor() as cursor:
            cursor.execute("insert into ceshi (indexi,namei,start,releasetime,score) VALUES (%s,%s,%s,%s,%s)", row)
        conn.commit()
        print("插入成功")
    except Exception as e:
        print("插入失败 报错如下")
        print("Error:(%s,%s,%s,%s,%s)" % row)
        # Show the actual failure; the message above promises it.
        print(repr(e))
    finally:
        conn.close()
    return value
if __name__ == '__main__':
    # Prepare an empty table first, then crawl and store all pages.
    mysql()
    # main() returns nothing, so its result is not bound to a name.
    main()
介绍PyQuery相关用法:
1.安装 pip install pyquery
2.操作
引入包 from pyquery import PyQuery as pq
doc = pq(html) #解析HTML字符串(html 为待解析的HTML文本)
doc('#container .list li') #使用CSS选择器定位标签,多级选择器之间用空格隔开
find() 查找所有子孙节点 例如:find('li')
children() 只查找直接子节点
children('.active') #过滤
parent() #查找父节点
parents() #查找祖先节点
siblings() #查找兄弟节点
如果取全部 需要进行遍历 加items()
获取信息
attr()
a = doc('.item-0.active a')
print(a.attr('href')) 等价于 print(a.attr.href)
text() 获取文本 remove() 移除
html() 获取html标签
API文档: https://pyquery.readthedocs.io/en/latest/traversing.html