114. XPath Usage Examples and Code

2020-02-12  羽天驿

1. Using XPath

XPath stands for XML Path Language.
Install the lxml library (it supports parsing both HTML and XML, and supports XPath expressions):
pip install lxml
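
A minimal sketch of what XPath parsing with lxml looks like; the HTML fragment below is made up purely for illustration:

from lxml import etree

# Tiny made-up HTML fragment for illustration
html = '<div><a href="/a">first</a><a href="/b">second</a></div>'

# etree.HTML() builds a node tree and tolerates broken markup
root = etree.HTML(html)
print(root.xpath('//a/text()'))   # ['first', 'second']
print(root.xpath('//a/@href'))    # ['/a', '/b']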

2. Using BeautifulSoup

pip3 install beautifulsoup4
Parsers:
- Python standard library: BeautifulSoup(html, "html.parser"); average speed, decent fault tolerance
- lxml HTML parser: BeautifulSoup(html, "lxml"); fast, good fault tolerance
- lxml XML parser: BeautifulSoup(markup, "xml"); fast, the only one that supports XML
- html5lib: BeautifulSoup(markup, "html5lib"); best fault tolerance, but slow
Import BeautifulSoup:
from bs4 import BeautifulSoup
Basic usage:
soup = BeautifulSoup(html, "lxml") # build a BeautifulSoup object with the lxml parser
print(soup.prettify()) # pretty-print with indentation
print(soup.title.string) # get the text of the title tag
print(soup.head)
print(soup.p)

Getting a node's name

print(soup.title.name)

Getting a node's attributes

soup.img.attrs["src"]
print(soup.p.attrs)
print(soup.p.attrs["name"])
print(soup.p["class"])

Getting the text a node contains

print(soup.p.string)
For markup like <p class="c1"><span>asdf</span>asdfasdfasdfasdfadsfad</p>, the p node has more than one child, so soup.p.string returns None.
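A short, self-contained sketch of that behaviour, using the same made-up fragment:

from bs4 import BeautifulSoup

html = '<p class="c1"><span>asdf</span>asdfasdfasdfasdfadsfad</p>'
soup = BeautifulSoup(html, 'lxml')

print(soup.p.string)      # None, because <p> has more than one child node
print(soup.p.get_text())  # concatenates all the text inside <p>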
Nested selection
<head>
<title>this is title</title>
</head>

Every node soup returns is a bs4.element.Tag, so you can keep selecting on it:

print(soup.head.title.string)


Associative selection
Some elements have no distinguishing features to locate them directly. In that case, first select a node you can locate, then use it as the reference point to select its child nodes, parent node, sibling nodes, and so on.
<p class="p1"></p>
<p></p>
<p></p>
print(soup.p.contents) # list of the p node's direct children
print(soup.p.descendants) # generator over all descendants of the p node
print(soup.a.parent) # the parent node
print(soup.a.parents) # all ancestor nodes
print(soup.a.next_sibling) # the next sibling at the same level
print(soup.a.previous_sibling) # the previous sibling at the same level
print(soup.a.next_siblings) # all following siblings
print(soup.a.previous_siblings) # all preceding siblings
print(list(soup.a.parents)[0].attrs['class'])


Method selectors: find nodes by attributes and by text
<ul><li></li><li></li></ul>
<ul><li></li><li>jjj</li><li></li><li></li></ul>
print(soup.find_all(name="ul"))
for ul in soup.find_all(name="ul"):
    print(ul.find_all(name="li"))
    for li in ul.find_all(name="li"):
        print(li.string)
soup.find_all(attrs={"id": "list-1"})


CSS selectors
<p id="p1" class="panel"></p><p class=""></p><p></p><p></p>
soup.select('.panel .panel_heading')
soup.select('ul li')
soup.select('#id1 .element')
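
As a rough, self-contained sketch (the HTML fragment below is made up for illustration), the three selectors above can be exercised like this:

from bs4 import BeautifulSoup

# Made-up HTML fragment matching the selectors above
html = '''
<div class="panel">
  <div class="panel_heading">Title</div>
  <ul id="id1">
    <li class="element">foo</li>
    <li class="element">bar</li>
  </ul>
</div>
'''
soup = BeautifulSoup(html, 'lxml')

print(soup.select('.panel .panel_heading'))  # descendants of .panel with class panel_heading
print(soup.select('ul li'))                  # li elements anywhere under a ul
print(soup.select('#id1 .element'))          # elements with class element under id="id1"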


3. The basic crawling workflow

First, we use the requests library to fetch the page:

import requests

def get_page(page):
    url = 'https://maoyan.com/board/4?offset=%s' % str(page * 10)
    headers =  {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36" 
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        # response.content is a byte stream; decode() turns it into a str
        return response.content.decode('utf-8')
    return None
def main():
    get_page(0)

if __name__ == '__main__':
    main()

4. Once we have the page, we need to parse it:

(1) Parsing a page with XPath
1. Crawling Douban (the group-explore page):

import requests
from lxml import etree

# Fetch the page
def get_page():
    url = 'https://www.douban.com/group/explore'
    headers =  {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36" 
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        # response.content is a byte stream; decode() turns it into a str
        return response.content.decode('utf-8')
    return None

# Parse the page with XPath
def parse_page(html):
    # Convert the HTML text into an etree node object
    etree_html = etree.HTML(html)
    # print(etree_html)
    # print(type(etree_html))

    # // matches descendants at any depth; * matches any node type; //* matches all descendant nodes of any type
    # results = etree_html.xpath('//*')
    # print(results)
    # print(len(results))
    
    # Match all img tags
    # results = etree_html.xpath('//img')
    # print(results)
    # print(len(results))

    # Get the text inside every a tag
    # results = etree_html.xpath('//a/text()')
    # print(results)

    # / selects direct children
    # results = etree_html.xpath('//div/h3/a/text()')
    # print(results)

    # Select nodes by attribute value: [@class="likes"]
    # results = etree_html.xpath('//div[@class="likes"]/text()')
    # print(results)

    # likes_list = []
    # for i in range(len(results)):
    #     if i % 2 == 0:
    #         likes_list.append(results[i])
    # print(likes_list)

    # Use @attr to get an attribute value
    # results = etree_html.xpath('//div[@class="pic"]/div[@class="pic-wrap"]/img/@src')
    # print(results)

    # When an attribute contains several values but you only know one of them, use contains()
    # results = etree_html.xpath('//div[contains(@class, "grid-16-8")]//div[@class="likes"]/text()')
    # print(results)

    # results = etree_html.xpath('//div[@class="grid-16-8 clearfix"]//div[@class="likes"]/text()')
    # print(results)

    # Functions such as contains() can be combined with and / or conditions
    # results = etree_html.xpath('//span[@class="pubtime" and contains(text(), "昨天")]/text()') 
    # print(results)

    # .. refers to the parent node
    # results = etree_html.xpath('//span[@class="pubtime" and contains(text(), "昨天")]/../../h3/a/text()') 
    # print(results)

    # Get entries published yesterday between 16:00 and 18:00
    # results = etree_html.xpath('//span[@class="pubtime" and contains(text(), "昨天") and (starts-with(substring-after(text(),"昨天"), "16:") or starts-with(substring-after(text(),"昨天"), "17:"))]/text()') 
    # print(results)

    # Select nodes by position; indexing starts at 1, not 0
    # [1] [last()] [position() < 4]
    # Get the 2nd title
    # results = etree_html.xpath('//div[@class="channel-item"][2]/div/h3/a/text()')[0]
    # print(results)

    # Get the 3rd through 5th titles
    # results = etree_html.xpath('//div[@class="channel-item"][position() >=3 and position() <= 5]/div/h3/a/text()')
    # print(results)

    # following::* selects every node after the current node's closing tag
    # results = etree_html.xpath('//div[@class="channel-item"][2]/following::*')
    # print(results)

    # following-sibling::* selects all sibling nodes that come after the current node
    results = etree_html.xpath('//div[@class="channel-item"][2]/following-sibling::*')
    print(results)
    print(len(results))



def main():
    html = get_page()
    # print(html)
    parse_page(html)

if __name__ == '__main__':
    main()

(2) Parsing a page with BeautifulSoup
Using BeautifulSoup to scrape headlines from Sina Sports:

import requests
from bs4 import BeautifulSoup

# Fetch the page
def get_page():
    url = 'http://sports.sina.com.cn/nba/'
    headers =  {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36" 
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        # response.content is a byte stream; decode() turns it into a str
        return response.content.decode('utf-8')
    return None

def parse_page(html):
    # html = '<div><span>坚持努力</span></div>'
    soup = BeautifulSoup(html, 'lxml')
    # Neatly formatted output; BeautifulSoup fills in some missing tags
    # print(soup.prettify())
    # print(soup.title.string)
    # print(soup.head)
    # print(soup.p)
    # print(soup.p.name) # the tag's name
    # print(soup.img.attrs) # attrs returns all of the node's attributes
    # print(soup.img.attrs['src'])
    # print(soup.p.contents) # list of all child nodes of the first p tag
    # print(list(soup.a.parents))

    # Locate nodes with CSS selectors; a space matches any descendant (it just has to be somewhere below)
    # a_list = soup.select('.news-list-b .list a')
    # for item in a_list:
    #     print(item.string)

    # When the class attribute holds several names, join them with dots (drop the spaces)
    a_list = soup.select('div.-live-layout-container.row-fuild .news-list-b .list a')
    for item in a_list:
        print(item.string)


def main():
    html = get_page()
    # print(html)
    parse_page(html)

if __name__ == '__main__':
    main()

(3) Using XPath and re to scrape epidemic numbers from the Sichuan Health Commission website

import requests
import re
from lxml import etree

# Fetch the page
def get_page(url):
    headers =  {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36" 
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        # response.content is a byte stream; decode() turns it into a str
        return response.content.decode('utf-8')
    return None

# Parse a detail page
def parse_detail_page(html):
    etree_html = etree.HTML(html)
    result = etree_html.xpath('//div[@class="wy_contMain fontSt"]//span/text()')
    # print(result)
    # Join the list into one big string
    result_str = ''.join(result)

    # Get the report date from the page title
    titles = etree_html.xpath('//h1[@class="blue fontNb"]/text()')
    print(titles)

    # Get the number of newly confirmed cases for the day
    pattern = re.compile(r'新增.*?确诊病例(\d+)例', re.S)
    xzs = re.findall(pattern, result_str)
    print(xzs)

# Parse the index page with XPath
def parse_page(html):
    # Convert the HTML text into an etree node object
    etree_html = etree.HTML(html)
    items = etree_html.xpath('//div[@class="wy_contMain fontSt"]/ul/li/a[starts-with(text(), "截至")]')
    for item in items:
        link = item.xpath('./@href')[0]
        title = item.xpath('./text()')[0]
        print(link)
        print(title)
        full_link = 'http://wsjkw.sc.gov.cn' + link
        # Fetch the detail page
        detail_html = get_page(full_link)
        # Parse the detail page
        parse_detail_page(detail_html)

def main():
    url = 'http://wsjkw.sc.gov.cn/scwsjkw/gggs/tygl.shtml'
    html = get_page(url)
    # print(html)
    parse_page(html)

if __name__ == '__main__':
    main()

Below are the database operations (connecting and inserting with pymysql):

import pymysql

# Get a database connection
def get_connection():
    host = '127.0.0.1'
    port = 3306
    user = 'root'
    password = 'Vff12345678'
    database = 'maoyan'
    db = pymysql.connect(host=host, user=user, password=password, database=database, charset='utf8', port=port)
    return db

# Get a cursor
def get_cursor(con):
    cursor = con.cursor()
    return cursor

# Close the database connection
def close_connection(con):
    con.close()

# Insert a row
def save_db(con, cursor, data_dict):
    sql = 'insert into movie (title, releasetime, actor, ranks, score, cover) values ("%s", "%s", "%s", "%s", "%s", "%s")' % (data_dict['title'], data_dict['releasetime'], data_dict['actor'], data_dict['rank'], data_dict['score'], data_dict['cover'])
    print(sql)
    cursor.execute(sql)
    con.commit()
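
A minimal usage sketch showing how these helpers fit together; the record below uses made-up placeholder values just to show the dict shape save_db() expects:

con = get_connection()
cursor = get_cursor(con)
# Placeholder record; real values would come from the Maoyan scraper
data_dict = {
    'title': 'some movie',
    'releasetime': '2020-01-01',
    'actor': 'some actor',
    'rank': '1',
    'score': '9.5',
    'cover': 'https://example.com/cover.jpg',
}
save_db(con, cursor, data_dict)
close_connection(con)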

(4) Scraping data served via XHR requests
Example: scraping women's clothing listings from Mogujie

import requests
import json

from day3.sqlachimy_hepler import *

# Fetch the JSON listing data
def get_page(page, action):
    # url = 'https://list.mogu.com/search?callback=jQuery21107629394841283899_1581471928849&_version=8193&ratio=3%3A4&cKey=15&page=' + str(page) + '&sort=pop&ad=0&fcid=50240&action=' + action + '&acm=3.mce.1_10_1ko4s.132244.0.9qYcxrQfkVICJ.pos_1-m_482170-sd_119&ptp=31.v5mL0b._head.0.ZS3jNSPn&_=1581471928851'
    url='https://list.mogu.com/search?&cKey=15&page='+str(page)+'&action='+action
    headers =  {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        # response.content is a byte stream; decode() turns it into a str
        return response.content.decode('utf-8')
    return None

# Parse the JSON response
def parse_page(html, action):
    # start_index = html.index('(')
    # html = html[start_index + 1:-2]
    # print(html)
    json_data = json.loads(html)
    # Flag that tells us whether this is the last page
    is_end = json_data['result']['wall']['isEnd']
    results = json_data['result']['wall']['docs']
    for item in results:
        # Attach the category to the record
        item['category'] = action
        print(item['title'])
        # Save to the database via SQLAlchemy
        save_goods(item)

    return is_end

# Fetch every page for each category
def get_all_pages():
    # actions = ['clothing', 'skirt', 'trousers', 'shoes', 'bags', 'boyfriend', 'neiyi', 'baby', 'home']
    actions = ['neiyi']

    for action in actions:
        page = 1
        print(action)
        print('*' * 20)
        while True:
            print(page)
            html = get_page(page, action)
            is_end = parse_page(html, action)
            if is_end:
                break
            page += 1

def main():
    get_all_pages()

if __name__ == '__main__':
    main()
# Create the database and the goods table
create  database mogujie default character set=utf8;
use mogujie;
create table goods(
id integer primary key auto_increment,
title varchar(128),
link varchar(1024),
trade_item_id varchar(32),
org_price varchar(32),
price varchar(32),
sale varchar(32),
category varchar(128)
);
create index ix_goods_title on goods (title);
# Store the data with SQLAlchemy
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, ForeignKey, UniqueConstraint, Index
from sqlalchemy.orm import sessionmaker, relationship
from sqlalchemy import create_engine

engine = create_engine("mysql+pymysql://root:361394621@localhost/mogujie?charset=utf8", max_overflow=5,encoding='utf-8')

Base = declarative_base()
class Goods(Base):
    __tablename__ = 'goods'
    id = Column(Integer, primary_key=True, autoincrement=True)    # primary key, auto-increment
    trade_item_id = Column(String(32))
    title = Column(String(128))
    category = Column(String(128))
    link = Column(String(1024))
    org_price = Column(String(32))
    price = Column(String(32))
    sale = Column(String(32))
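
The save_goods() helper imported from day3.sqlachimy_hepler is not shown in the post; below is a minimal sketch of what it might look like, assuming the Mogujie JSON fields (tradeItemId, orgPrice, price, sale, link, title) map onto the Goods columns:

# Create the table if it does not exist yet and prepare a session factory
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)

def save_goods(item):
    # item is one entry from json_data['result']['wall']['docs'];
    # the key names used here are assumptions about that payload
    session = Session()
    goods = Goods(
        trade_item_id=str(item.get('tradeItemId', '')),
        title=item.get('title', ''),
        category=item.get('category', ''),
        link=item.get('link', ''),
        org_price=str(item.get('orgPrice', '')),
        price=str(item.get('price', '')),
        sale=str(item.get('sale', '')),
    )
    session.add(goods)
    session.commit()
    session.close()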