Python - Crawler Basics - XPath - Scraping the Baidu Search Result List (Getting Titles and Links)
2019-11-22
MonkeyLei
Continuing from the earlier post MonkeyLei: Python-爬虫基础-Xpath-爬取百度风云榜旗下热点 (scraping the hot topics on Baidu's trending board). This is again driven by a project need: use a Baidu search to find the official website for a given keyword. For example, searching for 今日头条 (Toutiao) shows its official site in the result list.
(Of course, what I actually searched for were some policy-related government sites, not this one.) To get something usable quickly, I used XPath directly to work out a few rules for the titles and the links; they show up again in the script below.
I match the link through the result's id, because my goal is that each title lines up with exactly one URL, so I went with a one-to-one mapping. Without the one-to-one mapping, a title and a URL may end up belonging to different results!
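To make the one-to-one idea concrete, here is a minimal sketch of pulling a title and its href out of the result container with a given numeric id (the container structure is an assumption based on the rules above and may well change on Baidu's side):
from lxml import etree

def title_and_href(html_text, result_id):
    # Return (title, href) for the search result whose container id is result_id, or None
    tree = etree.HTML(html_text)
    nodes = tree.xpath('//*[@id="%d"]/h3/a' % result_id)
    if not nodes:
        return None
    a = nodes[0]
    return a.xpath('string(.)').strip(), a.get('href')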
Also, resolving Baidu's encrypted result link to the real URL works with some of the usual approaches, but they fail from time to time. So I simply let selenium load the page and then read current_url (this way is slower).
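Stripped down to the essentials, that idea looks roughly like this (a sketch only; chrome_options is the headless configuration shown further below, and newer selenium releases take it via the options keyword):
from selenium import webdriver

def resolve_with_browser(baidu_link, chrome_options=None):
    # Open the Baidu redirect link in (headless) Chrome and read the final URL
    browser = webdriver.Chrome(options=chrome_options)
    try:
        browser.get(baidu_link)
        return browser.current_url  # the URL after Baidu's redirect has run
    finally:
        browser.quit()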
Here is the occasionally-failing version of resolving the encrypted link (it is easily rejected by the remote anti-crawling checks):
# Work around "'ascii' codec can't encode characters"
if False:
    s = quote(url_path, safe=string.printable)
    try:
        req = request.Request(s, None, head)
        with request.urlopen(req) as uf:
            return uf.geturl()
    except Exception as err:
        print('get_url', err)
        return None
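For comparison, the same redirect-following can be done with the requests library instead of urllib. This is only an untested alternative sketch, and it can be refused by Baidu's anti-crawling checks just like the version above:
import requests

def get_real_url_via_requests(baidu_link, head=None):
    # Follow Baidu's redirect chain without a browser and return the final URL
    try:
        resp = requests.get(baidu_link, headers=head, allow_redirects=True, timeout=10)
        return resp.url  # URL after all redirects
    except requests.RequestException as err:
        print('get_real_url_via_requests', err)
        return None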
The rest is just logic handling, polishing the output, and so on. By the way, the chrome_options for starting Chrome silently have changed a bit; starting it this way is faster (it skips some unnecessary loading):
# Startup options
chrome_options = Options()
prefs = {
    'profile.default_content_setting_values': {
        'images': 2,  # don't load images
        # 'javascript': 2  # disabling js may break scraping of js-rendered content
    }
}
chrome_options.add_experimental_option("prefs", prefs)
chrome_options.add_argument('--headless')  # headless Chrome
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument("window-size=1024,768")
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('blink-settings=imagesEnabled=false')
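For reference, the options object is then handed to the driver roughly like this (newer selenium versions use the options keyword; the script below still uses the older chrome_options keyword, which selenium 3.x accepts with a deprecation warning):
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

opts = Options()
opts.add_argument('--headless')
browser = webdriver.Chrome(options=opts)  # older form: webdriver.Chrome(chrome_options=opts)
browser.get('https://www.baidu.com/')
print(browser.current_url)
browser.quit()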
That is basically it. **TODO**: one problem is left to sort out later (I'm busy with other things): sometimes the ids don't line up, because the title and the URL are not taken from under the same div, so the two can fall out of sync. I'll deal with it later; a rough idea for a fix is sketched right below.
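One possible fix (not what the script below does yet) is to iterate over the result containers themselves and take the title and the link from the same node, so the two can never drift apart. A sketch, assuming each result still sits in a div whose class contains "result" and that the title link lives under h3/a; Baidu's markup changes often, so treat the selectors as placeholders:
from lxml import etree

def paired_results(html_text):
    # Yield (title, baidu_link) pairs taken from the same result container
    tree = etree.HTML(html_text)
    for container in tree.xpath('//div[contains(@class, "result")]'):
        links = container.xpath('.//h3/a')
        if not links:
            continue
        a = links[0]
        yield a.xpath('string(.)').strip(), a.get('href')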
**crawler_department.py** -- for the selenium environment setup, just see the earlier article.
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
# File name: crawler_department.py
import os
import string
from urllib import request
from urllib.parse import quote
import ssl
from lxml import etree
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import UnexpectedAlertPresentException

# Pretend to be a regular browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
}
# The Cookie matters here, otherwise the request gets rejected; selenium would also work and saves assembling it by hand
bd_headers = {
'Cookie': 'BIDUPSID=8640A1C37FE0690CCFD0ADC95CDD0614; PSTM=1573012288; BAIDUID=8640A1C37FE0690C2FF67C0B307E1236:FG=1; BD_UPN=12314753; BDSFRCVID=cHFOJeC62xSAeNnwFmf5T97SHxCLPfRTH6aVosjQ3KdSxvaQuPVtEG0Pjx8g0KA-Nb29ogKKXgOTHw0F_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tR-tVCtatCI3HnRv5t8_5-LH-UoX-I62aKDs-Dt2BhcqEIL4hhLV3-4X5pjrWlcPMDnU5R5ctfJ8DUbSj4Qo5Pky-H3pQROhfnAJKRQH0q5nhMJN3j7JDMP0-xPfa5Oy523ihn3vQpnbhhQ3DRoWXPIqbN7P-p5Z5mAqKl0MLPbtbb0xXj_0Djb-jN0qJ6FsKKJ03bk8KRREJt5kq4bohjnDjgc9BtQmJJrt2-T_5CQbflRmypo0bh-FBn8HJq4tQg-q3R7JJDTxEDO4jJQiWlTLQf5v0x-jLgbPVn0MW-5DSlI4qtnJyUPRbPnnBn-j3H8HL4nv2JcJbM5m3x6qLTKkQN3T-PKO5bRu_CFhJKIbhKLlejRjh-FSMgTK2Pc8bC_X3b7EfMjpsh7_bf--D6cLbpAe5JbqBTnK-4ceQhj1oMFGLpOxy5K_hP6x2U70WNOfLMcHbRclHDbHQT3mMRvbbN3i34jpWRuLWb3cWMnJ8UbS5T3PBTD02-nBat-OQ6npaJ5nJq5nhMJmb67JDMr0eGKJJ6LqJJ4HV-35b5raeR5g5DTjhPrM2RQAWMT-0bFH_---ahQofPcFLtTxej-9yMcU55cUJGn7_JjOWCOds-J2hU5hLnLW2b37BxQxtNRd2CnjtpvhHRnRbP5obUPUWMJ9LUvftgcdot5yBbc8eIna5hjkbfJBQttjQn3hfIkj2CKLtCthMI04ejt35n-Wqx5KhtvtK65tsJOOaCvjOhQOy4oTj6Db0PQ-Wt6f3Djh_x-XJMO1JhOs0-jC3MvB-Jjyb-TIt23bb-nKKxjhVMQmQft20-IbeMtjBM_LBDuHVR7jWhviep72ybt2QlRX5q79atTMfNTJ-qcH0KQpsIJM5-DWbT8IjH62btt_tJk8_CoP; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=1427_21089_18560_29568_29220_28702; delPer=0; BD_CK_SAM=1; PSINO=7; COOKIE_SESSION=11616_0_9_9_7_46_0_3_9_6_8_20_261159_0_34_0_1574317407_0_1574317373%7C9%23334846_17_1574055214%7C4; BD_HOME=0; H_PS_645EC=a2613mtU9Z3zzlE3Z%2BGp%2Bj49ILi6lAP%2Fqx95Q%2FkEvc3CO5Lp9KZCsfjQvzU',
# 'Host': 'www.baidu.com',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
# 'X-Requested-With': 'XMLHttpRequest',
# 'Sec-Fetch-Site': 'same-origin',
# 'Sec-Fetch-Mode': 'cors',
}
# Only used during earlier debugging, no longer needed
# bd_params = {
#     'cb': 'cb: jQuery11020025825831777796848_1574319324952',
#     '_': '1574319324954',
#     'sid': '1427_21089_18560_29568_29220_28702',
#     'from': 'pc_web',
#     'req': '2'
# }

# Baidu search page
bd_search_url = 'https://www.baidu.com/s?ie=utf-8&wd='
chrome_options = None
# Resolve the real link behind Baidu's encrypted url
def get_url(url_path, head=bd_headers):
    if 'www' in url_path and url_path.startswith('www'):
        return url_path
    if 'baidu.com' not in url_path:
        print('在吧', url_path)
        return url_path
    if 'http' not in url_path:
        return None
    # Work around "'ascii' codec can't encode characters"
    if False:
        s = quote(url_path, safe=string.printable)
        try:
            req = request.Request(s, None, head)
            with request.urlopen(req) as uf:
                return uf.geturl()
        except Exception as err:
            print('get_url', err)
            return None
    if True:
        s = quote(url_path, safe=string.printable)
        # Load the page with selenium instead, which avoids the access-denied problem
        # After the silent load, wait up to 10s (plenty of js still runs afterwards, so the final page needs time)
        # wait = ui.WebDriverWait(browser, 10)
        # Fetch the page, js and all
        # Silent start
        # print('@hl 启动浏览器: ', url_path)
        browser = webdriver.Chrome(chrome_options=chrome_options)
        try:
            browser.get(s)
            # If find_elements_by_xpath can already locate an element, return right away without waiting
            # wait.until(lambda driver: browser.current_url)
            last_url = browser.current_url
            # print('@hl 实际url: ', last_url)
            # Return the real url
            return last_url
        except UnexpectedAlertPresentException:
            browser.switch_to.alert.accept()
        except Exception as err:
            print('@hl 启动浏览器获取url: ', url_path)
            print('↑↑↑', err)
        finally:
            # Quit the browser process
            browser.quit()
        return None
# Crawl the page content
def spider(url_path, head=headers, code='gbk'):
    data_html = ''
    # Work around "'ascii' codec can't encode characters"
    s = quote(url_path, safe=string.printable)
    req = request.Request(s, None, head)
    try:
        with request.urlopen(req) as uf:
            while True:
                data_temp = uf.read(1024)
                if not data_temp:
                    break
                # Decode each chunk and accumulate it as a string
                data_html += data_temp.decode(code, 'ignore')
    except Exception as err:
        print('spider--', err)
    return data_html
# Parse the nodes matching an xpath rule (originally the department names under a district)
def parse_text(html_data, rule):
    if not html_data:
        return None
    try:
        # Convert to an html object so xpath queries can run on it
        html_obj = etree.HTML(html_data)
        # Complete any unclosed tags
        last_html_data = etree.tostring(html_obj)
        # Convert back to an html object
        html_obj = etree.HTML(last_html_data)
        # Collect the matching nodes
        depart_names = html_obj.xpath(rule)
        keys = []
        for item in depart_names:
            # print(item.xpath('string(.)').strip())
            # print(item)
            # print(item.text)
            # keys.append(item.text)
            if hasattr(item, 'xpath'):
                keys.append(item.xpath('string(.)').strip())
            elif hasattr(item, 'text'):
                keys.append(item.text)
            else:
                keys.append(item)
        if len(keys) < 1:
            return None
        return keys
    except Exception as err:
        print('parse_text--', err)
        return None
# Write the page string to a local file
def output(html_data, file_path, encod='gbk'):
    # If the file already exists, remove it first
    if os.path.exists(file_path):
        os.remove(file_path)
    f = None
    try:
        f = open(file_path, mode='w+', encoding=encod)
        f.write(html_data)
    except IOError as err:
        print(err)
    finally:
        # Only close the file if it was actually opened
        if f:
            f.close()
# Search Baidu and pick out the official site
def bd_search(name):
    # Add extra request parameters as well?
    # parms = parse.urlencode(_bd_params)
    # param_url = (bd_search_url + name + "%s") % parms
    searchStr = spider(bd_search_url + name, bd_headers, 'utf-8')
    if False:
        print(searchStr)
        # Dump to a local file for a quick look
        output(searchStr, 'test/bds2.html', 'utf-8')
    get_all_titles = '//h3[@class="t"]/a[1]'  # all search result titles
    get_title = '//*[@id="1"]/h3/a'  # the title of a single search result
    # Build the id dynamically to address each result
    get_href = '//div[@class="f13"]/a[1]/text()'  # the domain without http/https
    get_link = '//div[@class="f13"]/a[1]/@href'  # Baidu's outbound (encrypted) link
    # The following id-based rules can be combined to get the url (not the outbound link)
    get_by_id = '//*[@id="occupy_id"]/div[2]/a[1]'  # the n-th search result, addressed by id
    get_by_id_a = '//*[@id="occupy_id"]/div/div[2]/div[2]/a[1]'  # the n-th search result, addressed by id (variant)
    get_by_id_span = '//*[@id="occupy_id"]/div/div[2]/span[1]'  # the n-th search result, addressed by id (variant)
    # Get all the titles
    keyValues = parse_text(searchStr, get_all_titles)
    # Get the real link matching each title
    search_url = []
    if keyValues:
        for index in range(len(keyValues)):
            # print(index, keyValues[index])
            # Get the url by substituting the id
            # print(get_by_id.replace('occupy_id', str(index + 1)))
            # urlRst = parse_text(searchStr, get_by_id.replace('occupy_id', str(index + 1)))
            # if not urlRst:
            #     urlRst = parse_text(searchStr, get_by_id_second.replace('occupy_id', str(index + 1)))
            print('搜索列表标题: ', keyValues[index], end=' ')
            # The text the rule above returns is abbreviated and would need checking and joining by hand,
            # so take the href the same way instead
            urlRst = parse_text(searchStr, get_by_id.replace('occupy_id', str(index + 1)) + '/@href')
            if not urlRst:
                # TODO the ids do not always line up yet
                # print('gggg', get_by_id_a.replace('occupy_id', str(index + 1)) + '/@href')
                urlRst = parse_text(searchStr, get_by_id_a.replace('occupy_id', str(index + 1)) + '/@href')
                if not urlRst:
                    urlRst = parse_text(searchStr, get_by_id_span.replace('occupy_id', str(index + 1)) + '/text()')
            if urlRst:
                for gv_url in urlRst:
                    # print(gv_url)  # http://www.bjchy.gov.cn/  http://fgw.zgcy.gov.cn/  http://www.baidu.com/link?url=xxx
                    # print(len(urlRst))
                    # print(urlRst[0])
                    # print(get_url(urlRst[0]))
                    trip_str = gv_url.strip().replace(' ', '')
                    print(trip_str, end='\n')
                    real_url = get_url(trip_str)
                    if real_url:
                        search_url_dic = {'id': index, 'name': keyValues[index], 'linker': gv_url, 'url': real_url}
                        search_url.append(search_url_dic)
                    else:
                        print('\n', index, keyValues[index], '不是一个有效链接: ', gv_url, '\n')
            else:
                print('url: ', end='\n')
    return search_url
if __name__ == '__main__':
    # Thread pool
    executor = ThreadPoolExecutor(max_workers=64)
    tasks = []
    sequence = True
    # ssl: use an unverified context so https requests don't fail on certificates
    ssl._create_default_https_context = ssl._create_unverified_context
    # Startup options
    chrome_options = Options()
    prefs = {
        'profile.default_content_setting_values': {
            'images': 2,  # don't load images
            # 'javascript': 2  # disabling js may break scraping of js-rendered content
        }
    }
    chrome_options.add_experimental_option("prefs", prefs)
    chrome_options.add_argument('--headless')  # headless Chrome
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument("window-size=1024,768")
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('blink-settings=imagesEnabled=false')
    # Here I originally fetched the list of government departments from a policy site, then searched them in multiple threads
    # url = 'http://www.xxx.gov.cn/'
    # htmlStr = spider(url)
    # parse_result = parse_text(htmlStr, '//*[@id="con_one_4"]/ul/li/a')
    if False:
        for searchkey in parse_result:
            print(searchkey)
    if True:
        parse_result = ['今日头条']
    for index in range(len(parse_result)):
        task = executor.submit(bd_search, parse_result[index])
        tasks.append(task)
        print(index, '运行中....')
    # Handle the thread results one after another
    if sequence:
        for future in as_completed(tasks):
            search_urls = future.result()
            for url_vl in search_urls:
                # for k, v in url_vl.items():
                #     print(k, v)
                print(url_vl['name'])
                print(url_vl['url'], '\n')
        tasks.clear()
    # print('搜索关键字', parse_result[1])
    # task = executor.submit(bd_search, ('朝阳区人力资源社会保障局'))
    # tasks.append(task)
    if not sequence:
        for future in as_completed(tasks):
            search_urls = future.result()
            for url_vl in search_urls:
                # for k, v in url_vl.items():
                #     print(k, v)
                print(url_vl['name'])
                print(url_vl['url'], '\n')
    print('结束\n')
The result: (the list only covers the kind of result div whose url is an actual link, so there aren't many entries; most official websites are of exactly this type, and list items in other formats can simply be left out of the analysis)
D:\PycharmProjects\python_study\venv3.x\Scripts\python.exe D:/MEME/fz/doc/Python/python_study/protest/crawler_department.py
0 运行中....
搜索列表标题: 今日头条 http://www.baidu.com/link?url=cnGjuxVRAfNVbHrYxU-gB6XNgCfyIJyp4gtXFcsYB2b1BRFSDow1vmwBKP89O-Y7
搜索列表标题: 今日头条成都公司面试经验(共11个真人分享) - 职朋职业圈 http://www.baidu.com/link?url=uAyIuHs3m_O9lEp3niWMPvdh54GNyMJGj5Q6JJIfzz0SaXOGeymOHW7HaD_GPlYTmm9JQEM1B4srLr_U-93tVa
搜索列表标题: 成都今日头条科技有限公司 url:
搜索列表标题: 头条号 - 你创作的,就是头条 http://www.baidu.com/link?url=fKFsNjXxoXh15pQQcAExS0QvASKkJSpsR_WwvR4QcIr06mEts7EbnB7suBzR5Cxt8kRuNRjhjHZjhn0HB8WcS6jb5o9dSDpgCU_e9IMhaEe
搜索列表标题: 今日头条APP下载,今日头条IOS客户端下载,安卓客户端下载 - 今日... url:
搜索列表标题: 今日头条下载_今日头条手机版下载_今日头条安卓版免费下载-太平洋... http://www.baidu.com/link?url=a2m3SbZqtO-4ZsbnOOtQjkgzfl-uUU0LD5-tZPPoKTnmx9-NzfHRPwDMLCCWVkmW
今日头条
https://www.toutiao.com/
今日头条成都公司面试经验(共11个真人分享) - 职朋职业圈
https://www.job592.com/pay/comms31938638.html
头条号 - 你创作的,就是头条
http://www.yuhuijob.com/showcom.php?c=5&shid=1498897056174
今日头条下载_今日头条手机版下载_今日头条安卓版免费下载-太平洋...
https://mp.toutiao.com/auth/page/login/?redirect_url=JTJG
结束
Process finished with exit code 0
The goal, as always, is to learn and deepen understanding... and also, well... meow meow meow.