使用PyQuery爬取猫眼电影 及PyQuery使用方法-实战篇

2019-06-25  本文已影响0人  Serven_Students

讲解:

涉及Python相关库:

from urllib.robotparser import RobotFileParser
import requests
from pyquery import PyQuery
import pymysql

下面展示一段神奇的代码:

from urllib.robotparser import RobotFileParser
import requests
from pyquery import PyQuery
import pymysql

# First value of the ``offset`` query parameter requested by main().
offert = 0
# Exclusive upper bound of the offset range; main() steps through it in 10s.
offert_tail = 100

def get_one_page(url):
    """Download one page and hand its HTML to ``one_page`` if robots.txt allows.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        None. Pages that robots.txt disallows, or that do not answer with
        HTTP 200, are skipped without any output.
    """
    from urllib.parse import urljoin

    rb = RobotFileParser()
    # RobotFileParser must be pointed at the robots.txt file itself; giving it
    # the page URL would make it parse the page's HTML as if it were rules.
    rb.set_url(urljoin(url, "/robots.txt"))
    rb.read()
    if not rb.can_fetch("*", url):
        return

    # A timeout keeps one stalled connection from hanging the whole crawl.
    reponse = requests.get(url, timeout=10)
    if reponse.status_code == 200:
        one_page(reponse.text)

def one_page(html):
    """Extract every ``<dd>`` entry from a page, then log and store it.

    For each entry the rank, name, cast line, release time and score texts are
    collected into a dict, appended to the text file through ``write_r`` and
    inserted into MySQL through ``insert_mysql``.

    Args:
        html: HTML source of one board page.
    """
    # Result key -> CSS class looked up below the entry with ``find``.
    # The key ``start`` (for ``.star``) is kept as-is: it is also the column name.
    selectors = (
        ('namei', '.name'),
        ('start', '.star'),
        ('releasetime', '.releasetime'),
        ('score', '.score'),
    )
    for entry in PyQuery(html)('dd').items():
        # The rank is the text of the entry's direct <i> children.
        record = {'indexi': entry.children('i').text()}
        for key, css in selectors:
            record[key] = entry.find(css).text()
        write_r(str(record))
        insert_mysql(record)

def main():
    """Crawl each board page for offsets ``offert`` up to ``offert_tail`` in steps of 10."""
    base = "https://maoyan.com/board/4?offset="
    for page_offset in range(offert, offert_tail, 10):
        get_one_page(f"{base}{page_offset}")

def write_r(neirong):
    """Append ``neirong`` followed by a newline to ``./text.txt`` (UTF-8).

    Args:
        neirong: Text of the line to append.
    """
    with open('./text.txt', mode='a', encoding='utf-8') as out:
        line = neirong + '\n'
        out.write(line)


def mysql():
    """Make sure the ``ceshi`` table exists in ``django_one`` and empty it.

    Connects to the MySQL server on 127.0.0.1, tries to create the table,
    then truncates it so the run starts without rows.

    Raises:
        pymysql.MySQLError: If connecting fails, if creating the table fails
            for any reason other than "table already exists", or if the
            truncate fails.
    """
    # Connect to the database.
    # NOTE(review): credentials are hard-coded; consider reading them from config.
    conn = pymysql.connect(host='127.0.0.1', port=3306, user='ceshi', passwd='123456',
                                db='django_one', charset='utf8')

    # The table name is lower-case so it matches the TRUNCATE below and the
    # INSERT in insert_mysql; MySQL table names can be case-sensitive.
    sql_table = """CREATE TABLE ceshi (
                id INT PRIMARY KEY AUTO_INCREMENT,
                indexi INT (12),
                namei VARCHAR (225),
                start VARCHAR (255),
                releasetime VARCHAR (225),
                score VARCHAR (225),
                time TIMESTAMP DEFAULT now())"""

    try:
        # Create a cursor object.
        with conn.cursor() as cursor:
            try:
                cursor.execute(sql_table)
            except pymysql.MySQLError as exc:
                # 1050 is MySQL's "table already exists" error; anything else
                # is a real failure and must not be reported as "already created".
                if not exc.args or exc.args[0] != 1050:
                    raise
                print("数据库已创建")
            # Delete all existing rows from the table.
            cursor.execute('truncate table ceshi')
    finally:
        # Close the connection even when a statement above raised.
        conn.close()

def insert_mysql(value):
    """Insert one scraped record into the ``ceshi`` table.

    Opens its own connection, inserts the row and commits. A failed insert is
    reported on stdout (including the underlying error) and does not raise,
    so one bad row does not stop the crawl.

    Args:
        value: Mapping with the keys ``indexi``, ``namei``, ``start``,
            ``releasetime`` and ``score``.

    Returns:
        The same ``value`` object that was passed in.

    Raises:
        KeyError: If ``value`` lacks one of the required keys.
    """
    # Read the fields before connecting so a malformed record never opens a connection.
    params = (value['indexi'], value['namei'], value['start'],
              value['releasetime'], value['score'])

    # Connect to the database.
    conn = pymysql.connect(host='127.0.0.1', port=3306, user='ceshi', passwd='123456',
                           db='django_one', charset='utf8')
    try:
        # Create a cursor object; placeholders let the driver escape the values.
        with conn.cursor() as cursor:
            cursor.execute("insert into ceshi (indexi,namei,start,releasetime,score) VALUES (%s,%s,%s,%s,%s)", params)
        conn.commit()
        print("插入成功")
    except Exception as e:
        # Deliberately best-effort per row, but show the cause instead of hiding it.
        print("插入失败 报错如下")
        print("Error:(%s,%s,%s,%s,%s)" % params)
        print(e)
    finally:
        conn.close()
    return value

if __name__ == '__main__':
    # Reset the target table first, then crawl. main() returns nothing, so its
    # result is not bound to a name.
    mysql()
    main()
介绍PyQuery相关用法:
1.安装 pip install pyquery

2.操作

引入包 from pyquery import PyQuery as pq

doc = pq(html) #解析HTML字符串(html 为待解析的HTML文本)

doc('#container .list li') #通过CSS选择器定位标签,多级选择器之间用空格隔开

find() 查找所有子孙节点 例如:find('li')

children() 只查找直接子节点

children('.active') #过滤

parent() #查找父节点

parents() #查找祖先节点

siblings() #查找兄弟节点

如果取全部 需要进行遍历 加items()

获取信息

attr()

a = doc('.item-0.active a')

print(a.attr('href')) 等价于 print(a.attr.href)

text() 获取文本 remove() 移除

html() 获取html标签

[API文档](https://pyquery.readthedocs.io/en/latest/traversing.html)

联系方式QQ:

294402584

上一篇下一篇

猜你喜欢

热点阅读