爬虫实战(1)--广州法律法规信息抓取
2019-05-22 本文已影响0人
周周周__
注意:本文的目标网站会封禁频繁访问的 IP,作者使用的是固定代理 IP。
# -*- coding: utf-8 -*-
'''
Time : 2019/5/22 10:02
Author : zhouzhou
Email : 1085089422@qq.com
File : guang_zhou_law.py
Software: PyCharm
url : http://www.gz.gov.cn/gzgov/s2792/gk_fggw_list2.shtml
database: law/guang_zhou_law
'''
import requests
from fake_useragent import UserAgent
ua = UserAgent()
import re
from lxml import etree
import psycopg2
import time
def get_list(url):
    """Fetch one listing page and dispatch every detail-page link to get_href.

    Args:
        url: absolute URL of a listing page
             (http://www.gz.gov.cn/gzgov/s2792/gk_fggw_list2_{n}.shtml).

    Side effects: prints the external IP seen through the proxy (debug),
    then issues an HTTP GET for the listing page and calls get_href() for
    each link found.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3641.400 QQBrowser/10.4.3284.400',
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
    }
    # FIX: the original proxy URL was 'http:user:pass@host:port' — missing
    # the '//' after the scheme, which requests cannot parse.  The correct
    # form is 'http://user:password@host:port'.
    proxies = {
        "http": "http://c9zrdbya0q:diwbjxksqt@122.114.166.184:23128"
    }
    # Debug check: print the external IP as seen through the proxy.
    print(requests.get(url='http://www.icanhazip.com/', headers=headers,
                       proxies=proxies, timeout=30).text)
    # NOTE(review): the listing page itself is fetched WITHOUT the proxy in
    # the original code (only get_con uses it) — preserved as-is; confirm
    # whether that was intentional.
    response = requests.get(url, headers=headers, timeout=30)
    response.encoding = 'utf8'
    html = etree.HTML(response.text)
    hrefs = html.xpath('//ul[@class="news_list"]/li/a/@href')
    for href in hrefs:
        get_href(href)
def get_href(href):
    """Turn a (possibly relative) detail-page link into an absolute URL and scrape it.

    Listing pages emit links prefixed with '../../'; those are rebased onto
    the site root.  Anything else is assumed to already be absolute.
    """
    relative_prefix = '../../'
    if relative_prefix in href:
        url = 'http://www.gz.gov.cn/' + href.replace(relative_prefix, '')
    else:
        url = href
    print("~" * 200)  # visual separator between records in the console log
    print(url)
    get_con(url)
def get_con(url):
    """Scrape one law/regulation detail page and persist it via save().

    Extracts the title, the document number (wen_hao), the publication date
    and the body text, printing each for progress monitoring.

    Args:
        url: absolute URL of a detail page on www.gz.gov.cn.

    Raises:
        IndexError: if the date pattern finds no match (unchanged from the
        original behaviour — such pages are skipped by crashing loudly).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3641.400 QQBrowser/10.4.3284.400',
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
    }
    # FIX: proxy URL needs '//' after the scheme ('http://user:pass@host:port');
    # the original 'http:user:pass@host:port' is unparseable by requests.
    proxies = {
        "http": "http://c9zrdbya0q:diwbjxksqt@122.114.166.184:23128"
    }
    time.sleep(2)  # throttle: the site bans aggressive crawlers
    response = requests.get(url, headers=headers, proxies=proxies, timeout=30)
    response.encoding = 'utf8'
    html = etree.HTML(response.text)
    # Two page templates exist; try 'content_title' first, fall back to
    # 'info_title'.  FIX: narrowed the bare `except:` to IndexError — a bare
    # except would also swallow KeyboardInterrupt and real bugs.
    try:
        title = html.xpath('//h1[@class="content_title"]/text()')
        print(title)
        title = title[0].strip()
    except IndexError:
        title = html.xpath('//h1[@class="info_title"]/text()')[0].strip()
    print(title)
    # NOTE(review): '[穗府|厅外字]' is a character CLASS (matches any single
    # one of these characters), not alternation — possibly meant
    # '(穗府.*?号|厅外字.*?号)'.  Pattern kept byte-identical; verify against
    # real pages before changing.
    try:
        wen_hao = re.findall('([穗府|厅外字].*?号)', response.text)[0]
    except IndexError:
        # Fallback: government decree numbers like '第12号'; findall may
        # legitimately return an empty list here.
        wen_hao = re.findall('(第\d+号)', response.text)
    print(bool(wen_hao))
    if not wen_hao:
        # No document number found — wen_hao stays an empty list (original
        # behaviour preserved; save() receives it unchanged).
        pass
    else:
        wen_hao = '广州市人民政府令' + wen_hao[0]
    print(wen_hao)
    # Publication date, e.g. '2019年5月22日'; IndexError propagates on miss.
    data = re.findall('>{0,1}(\S{1,4}年\S{1,2}月.{1,3}日)', response.text)[0]
    print(data)
    # Body text — again two templates; fall back when the first is empty.
    con = html.xpath('//div[@class="mainbox_bg content clearfix"]//text()')
    if con == []:
        con = html.xpath('//div[@class="info_cont"]//text()')
    con = ''.join(con)
    # GBK round-trip strips characters a Windows console cannot display.
    print(con.encode('GBK', 'ignore').decode('GBk'))
    save(title, wen_hao, data, con)
def save(title, wen_hao, data, con):
    """Insert one scraped record into the PostgreSQL table guang_zhou_law.

    Args:
        title:   document title.
        wen_hao: document number string (may be an empty list when no
                 number was found upstream — unchanged from original).
        data:    publication date string (stored in column data_time).
        con:     full body text (stored in column content1).

    Returns:
        False on insert failure, otherwise None.
    """
    conn = psycopg2.connect(database='law', user='postgres', password='123456', host='127.0.0.1', port='5432')
    # FIX: the original leaked one connection per call (never closed);
    # try/finally guarantees the connection is released on every path.
    try:
        cur = conn.cursor()
        sql = 'insert into guang_zhou_law(title, data_time,wen_hao, content1)values (%s, %s, %s, %s)'
        try:
            # Parameterized query — safe against SQL injection.
            cur.execute(sql, (title, data, wen_hao, con))
        except Exception as e:
            print('数据库插入错误:', e)
            return False
        conn.commit()
    finally:
        conn.close()
if __name__ == "__main__":
for i in range(1, 67):
print("iiiiiiii", i)
url = 'http://www.gz.gov.cn/gzgov/s2792/gk_fggw_list2_{}.shtml'.format(i)
get_list(url)
# break