Crawler in Practice (1) -- Scraping Guangzhou Laws and Regulations

2019-05-22  by 周周周__

The site scraped in this post bans IPs, so the author routes every request through a fixed proxy IP.
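Before the full script, here is a minimal sketch of that anti-ban setup: send requests through a fixed proxy and pick a random real-browser User-Agent via fake_useragent. The proxy address and credentials below are placeholders, not real ones.

import requests
from fake_useragent import UserAgent

ua = UserAgent()
# Placeholder proxy; requests expects the form http://user:password@host:port.
proxies = {'http': 'http://user:password@1.2.3.4:23128'}
# A random, real browser User-Agent string.
headers = {'User-Agent': ua.random}

# icanhazip echoes back the IP the server sees, so this verifies the proxy works.
print(requests.get('http://www.icanhazip.com/', headers=headers, proxies=proxies).text)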

# -*- coding: utf-8 -*-
'''
Time    : 2019/5/22 10:02
Author  : zhouzhou
Email   : 1085089422@qq.com
File    : guang_zhou_law.py
Software: PyCharm
url     : http://www.gz.gov.cn/gzgov/s2792/gk_fggw_list2.shtml
database: law/guang_zhou_law
'''

import re
import time

import requests
import psycopg2
from lxml import etree

# Shared request headers and the fixed proxy used for every request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3641.400 QQBrowser/10.4.3284.400',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
}
PROXIES = {
    'http': 'http://c9zrdbya0q:diwbjxksqt@122.114.166.184:23128',
}

def get_list(url):
    # Print the outbound IP first to confirm the proxy is actually in use.
    print(requests.get('http://www.icanhazip.com/', headers=HEADERS, proxies=PROXIES).text)
    response = requests.get(url, headers=HEADERS, proxies=PROXIES)
    response.encoding = 'utf8'
    html = etree.HTML(response.text)
    # Each <li> in the news list links to one regulation's detail page.
    hrefs = html.xpath('//ul[@class="news_list"]/li/a/@href')
    for href in hrefs:
        get_href(href)


def get_href(href):
    # List pages use relative links such as '../../xxx'; rebuild the absolute URL.
    if '../../' in href:
        url = 'http://www.gz.gov.cn/' + href.replace('../../', '')
    else:
        url = href
    print("~" * 200)
    print(url)
    get_con(url)
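# The same normalization could be done with the standard library, e.g.:
#   from urllib.parse import urljoin
#   url = urljoin('http://www.gz.gov.cn/gzgov/s2792/gk_fggw_list2.shtml', href)
# urljoin resolves the leading '../../' against the list page's location.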


def get_con(url):
    time.sleep(2)  # throttle requests; the site bans aggressive clients
    response = requests.get(url, headers=HEADERS, proxies=PROXIES)
    response.encoding = 'utf8'
    html = etree.HTML(response.text)
    # Detail pages come in two templates with different title classes.
    try:
        title = html.xpath('//h1[@class="content_title"]/text()')[0].strip()
    except IndexError:
        title = html.xpath('//h1[@class="info_title"]/text()')[0].strip()
    print(title)
    # Document number: either a prefixed form (穗府.../厅外字...号) or a bare
    # order number (第N号), which gets the government-order prefix added.
    try:
        wen_hao = re.findall(r'((?:穗府|厅外字).*?号)', response.text)[0]
    except IndexError:
        matches = re.findall(r'(第\d+号)', response.text)
        wen_hao = ('广州市人民政府令' + matches[0]) if matches else ''
    print(wen_hao)
    # Publication date, e.g. 2019年5月22日.
    data = re.findall(r'>?(\S{1,4}年\S{1,2}月.{1,3}日)', response.text)[0]
    print(data)

    # Body text also differs between the two page templates.
    con = html.xpath('//div[@class="mainbox_bg content clearfix"]//text()')
    if not con:
        con = html.xpath('//div[@class="info_cont"]//text()')
    con = ''.join(con)
    # The GBK round-trip drops characters a Windows console cannot display.
    print(con.encode('GBK', 'ignore').decode('GBK'))
    save(title, wen_hao, data, con)
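# A quick sanity check of the three patterns above, against hypothetical
# strings (the sample document numbers are illustrative, not from the site):
#   re.findall(r'((?:穗府|厅外字).*?号)', '穗府〔2019〕3号')        -> ['穗府〔2019〕3号']
#   re.findall(r'(第\d+号)', '广州市人民政府令第163号')             -> ['第163号']
#   re.findall(r'>?(\S{1,4}年\S{1,2}月.{1,3}日)', '2019年5月22日')  -> ['2019年5月22日']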


def save(title, wen_hao, data, con):
    conn = psycopg2.connect(database='law', user='postgres', password='123456', host='127.0.0.1', port='5432')
    cur = conn.cursor()
    sql = 'insert into guang_zhou_law(title, data_time, wen_hao, content1) values (%s, %s, %s, %s)'
    try:
        cur.execute(sql, (title, data, wen_hao, con))
        conn.commit()
    except Exception as e:
        print('database insert error:', e)
        return False
    finally:
        conn.close()


if __name__ == "__main__":
    # Walk the 66 list pages: gk_fggw_list2_1.shtml through gk_fggw_list2_66.shtml.
    for i in range(1, 67):
        print('list page', i)
        url = 'http://www.gz.gov.cn/gzgov/s2792/gk_fggw_list2_{}.shtml'.format(i)
        get_list(url)
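The script assumes the guang_zhou_law table already exists in the law database. Here is a minimal one-off setup sketch matching the insert statement above; the column names come from the script, but the plain-text column types are my assumption.

import psycopg2

# Hypothetical setup script: create the table the crawler inserts into.
# Column types are assumptions; only the names are fixed by the insert above.
conn = psycopg2.connect(database='law', user='postgres', password='123456', host='127.0.0.1', port='5432')
cur = conn.cursor()
cur.execute('''
    create table if not exists guang_zhou_law (
        id        serial primary key,
        title     text,
        data_time text,
        wen_hao   text,
        content1  text
    )
''')
conn.commit()
conn.close()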