Web Scraping in Practice (4) -- Scraping Data from the Law Library (law-lib.com)

2019-05-22  周周周__

This post scrapes laws and regulations from the Law Library site (www.law-lib.com). The flow: page through the listing for each issuing authority, collect the detail-page links, parse each law's metadata and full text (following the "下一页" link for documents split across two pages), and save the rows to PostgreSQL with an MD5 fingerprint of the content for de-duplication.

import requests
import re
from urllib.parse import quote
from lxml import etree
import chardet
import psycopg2
import time
import hashlib
from fake_useragent import UserAgent
ua = UserAgent()
headers = {
    'User-Agent': ua.random  # note: the header name is 'User-Agent'; a 'User_Agent' key is sent literally and ignored by servers
}

def get(url):
    print("Requesting page")
    response = requests.get(url=url, headers=headers)
    time.sleep(3)  # be polite: pause between requests
    # law-lib.com serves GBK-encoded pages; detect the charset instead of trusting headers
    encoding = chardet.detect(response.content)
    res = response.content.decode(encoding['encoding'], 'ignore')
    return res
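As written, get() has no timeout and no error handling, so one stalled connection hangs the whole crawl. A minimal hardened variant might look like this (a sketch: the helper name, retry count, and timeout value are mine, not from the original code):

def get_with_retry(url, retries=3, timeout=10):
    # hypothetical helper, not part of the original script
    for attempt in range(retries):
        try:
            response = requests.get(url=url, headers=headers, timeout=timeout)
            response.raise_for_status()  # surface 4xx/5xx responses as exceptions
            time.sleep(3)
            encoding = chardet.detect(response.content)
            return response.content.decode(encoding['encoding'], 'ignore')
        except requests.RequestException as e:
            print('Request failed ({}), attempt {}/{}'.format(e, attempt + 1, retries))
    return None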


def clear1(res, i):
    print('Parsing listing page {}'.format(i))
    html = etree.HTML(res)
    hrefs = html.xpath('//ul[@class="line2"]/li/a/@href')
    # the pager shows the current page as plain text "第i页"; if that span is
    # missing, page i does not exist and the caller should stop paginating
    next = html.xpath('//span[contains(text(),"第{}页")]/text()'.format(i))
    print(hrefs)
    return hrefs, next
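To see what those two XPath expressions actually select, here is a tiny standalone check. The HTML snippet is hand-written to mimic the listing markup implied by the selectors; it is not copied from the site:

from lxml import etree

sample = '''
<ul class="line2">
  <li><a href="law_view.asp?id=523891">some law title</a></li>
</ul>
<span>第4页</span>
'''
html = etree.HTML(sample)
print(html.xpath('//ul[@class="line2"]/li/a/@href'))           # ['law_view.asp?id=523891']
print(html.xpath('//span[contains(text(),"第4页")]/text()'))   # ['第4页'] -> page 4 exists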


def clear2(res, url):
    html = etree.HTML(res)
    # metadata fields appear as <li>【...】 blocks on the detail page
    pub_time = re.findall(r'<li>【颁布时间】(\d{4}-\d{1,2}-\d{1,2})</li>', res)
    title = re.findall(r'<li>【标题】(.*?)</li>', res)
    wen_hao = re.findall(r'<li>【发文号】(.*?)</li>', res)
    lose_time = re.findall(r'<li>【失效时间】(.*?)</li>', res)
    pub_ora = re.findall(r'<li>【颁布单位】(.*?)</li>', res)
    sour = re.findall(r'【法规来源】(h.*?)<', res)
    # skip the first 9 text nodes (the header block) and keep the body text
    content = ''.join((html.xpath('//div[@class="viewcontent"]//text()'))[9:])
    # strip the pager line that gets captured along with the body
    content1 = re.sub(r'不分页显示   总共2页  1 \[2\]   下一页', '', content)
    content2 = ''
    next = html.xpath('//a[contains(text(),"下一页")]/@href')
    if next != []:
        print("Following page 2 of the document")
        url = 'http://www.law-lib.com/law/law_view.asp' + next[0]
        print(url)
        res2 = get(url)
        html2 = etree.HTML(res2)
        content3 = html2.xpath('//div[@class="viewcontent"]//text()')
        # page 2 repeats the header; jump ahead to the first "第...条" (Article N) clause
        index1 = None
        for data in content3:
            if re.findall(r'第\S+条', data):
                index1 = content3.index(data)
                break
        if index1 is not None:
            content2 = ''.join(content3[index1:])

    content = content1 + content2
    content = re.sub(r'不分页显示   总共2页  \[1\] 2 上一页 ', '', content)
    # debug output for the record just parsed
    print(title[0])
    print(pub_time[0])
    print(wen_hao[0])
    print(lose_time[0])
    print(pub_ora[0])
    print(url)
    if sour == []:
        sour = ['']  # some laws have no 【法规来源】 field
    print(content)
    # MD5 of the full text, stored as a de-duplication fingerprint
    sign = hashlib.md5(content.encode('utf-8')).hexdigest()
    print(sign)
    return title[0], wen_hao[0], lose_time[0], pub_time[0], pub_ora[0], sour[0], content, url, sign
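A quick sanity check of the metadata regexes against a hand-written fragment (the snippet mimics the 【...】 blocks the patterns expect; it is not real page source):

import re

fragment = '<li>【标题】某某管理条例</li><li>【颁布时间】2019-5-22</li>'
print(re.findall(r'<li>【标题】(.*?)</li>', fragment))                          # ['某某管理条例']
print(re.findall(r'<li>【颁布时间】(\d{4}-\d{1,2}-\d{1,2})</li>', fragment))   # ['2019-5-22']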


def save(data):
    conn = psycopg2.connect(database='falv_wenku', user="postgres", password='123456',
                            host='127.0.0.1', port='5432')
    try:
        cur = conn.cursor()
        sql = "insert into fa_lv_lib(title, wen_hao, lose_time, pub_time, pub_ora, sour, conten, url, sign)values" \
              "(%s, %s, %s, %s, %s, %s, %s, %s, %s)"
        cur.execute(sql, (data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], data[8]))
        conn.commit()
    except Exception as e:
        print("Database error:", e)
    finally:
        conn.close()  # opening a connection per row is wasteful but simple; at least close it


if __name__ == "__main__":
    for key in ['最高人民法院', '最高人民检察院', '国务院', '国务院办公厅']:
        i = 3  # incremented before use, so crawling starts at page 4
        if key == '全国人民代表大会':
            # this key is not in the list above, so the branch never fires;
            # presumably a resume point left over from an earlier run
            i = 49
        while True:
            i = i + 1
            # the site expects GBK-encoded query parameters
            url = 'http://www.law-lib.com/law/lawml.asp?bbdw={}&pages={}'.format(quote(key.encode('gbk')), i)
            res = get(url)
            hrefs, marker = clear1(res, i)
            if marker == []:  # no "第i页" marker means page i does not exist: stop paginating
                break
            for href in hrefs:
                url = 'http://www.law-lib.com/law/' + href
                res = get(url)
                data = clear2(res, url)
                save(data)
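One detail worth calling out: the listing URL percent-escapes GBK bytes rather than the default UTF-8, because the site itself is GBK-encoded. A standalone illustration:

from urllib.parse import quote

key = '国务院'
print(quote(key.encode('gbk')))  # percent-escaped GBK bytes for the bbdw= parameter
print(quote(key))                # for comparison: the default UTF-8 escaping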
