正则表达式爬取网页内容
2019-05-12 本文已影响0人
haokeed
基本方法
import re

# re.search: scan the whole string and return the first match object found.
string = "pythonpython"
pat = 'th'
r1 = re.compile(pat).search(string)
print(r1)

# re.match: only succeeds when the pattern matches at the very start of the
# string, so 'th' gives None here while 'py' gives a match object.
pat2 = 'th'
r2 = re.compile(pat2).match(string)
print(r2)
pat3 = 'py'
r3 = re.compile(pat3).match(string)
print(r3)

# re.findall: return every non-overlapping match as a list.
pat4 = 'th'
r4 = re.compile(pat4).findall(string)
print(r4)

# Greedy mode: (.*) captures as much as possible before the last 'on'.
pat = 'py(.*)on'
r = re.compile(pat).findall(string)
print(r)

# Non-greedy mode: (.*?) captures as little as possible before the next 'on'.
pat = 'py(.*?)on'
r = re.compile(pat).findall(string)
print(r)
image.png
image.png
模式修正符
# Pattern modifiers (flags) change how a pattern is interpreted:
#   re.I (re.IGNORECASE) - match letters regardless of case
#   re.S (re.DOTALL)     - let "." match newline characters as well
#   re.M (re.MULTILINE)  - multi-line matching
string = 'Python'
pat = 'pyt'
# Lower-case pattern against a capitalised string: only matches with IGNORECASE.
r = re.compile(pat, flags=re.IGNORECASE).findall(string)
print(r)

string = '''Python
baidu
'''
pat = 't.*d'
# The 't' and the 'd' sit on different lines, so "." must be allowed to
# cross the newline for this pattern to match.
r = re.compile(pat, flags=re.DOTALL).findall(string)
print(r)
image.png
51job 爬虫:用正则表达式提取网页信息
import re

import requests
from pandas import DataFrame

# Fetch one page of 51job search results and pull four fields out of the
# HTML with regular expressions: job title, company, location and salary.
url = "https://search.51job.com/list/010000,000000,0000,00,9,99,Java%2520%25E5%25BC%2580%25E5%258F%2591,2,1.html"
# Without a timeout, requests can wait indefinitely on an unresponsive server.
res = requests.get(url, timeout=10)
# Fail loudly on an HTTP error instead of running the patterns over an error page.
res.raise_for_status()
res.encoding = 'gbk'  # decode the response body as GBK
html = res.text  # decode once and reuse for every pattern

# Job title: the title attribute of each posting link.
pat = r'<a target="_blank" title="(.*?)" href="(?:.*?)" onmousedown="(?:.*)">'
position = re.findall(pat, html)
print(position)
print(len(position))

# Company: the title attribute of the link inside the "t2" span.
pat = r'<span class="t2"><a target="_blank" title="(.*)" href=".*">.*</a></span>'
company = re.findall(pat, html)
print(company)
print(len(company))

# Location: text of the first "t3" span after each <div class="el">.
# re.S lets the lazy (?:.*?) prefix cross line breaks between the two tags.
pat = r'<div class="el">(?:.*?)<span class="t3">(.*?)</span>'
addrs = re.findall(pat, html, re.S)
print(addrs)
print(len(addrs))

# Salary: same approach as the location, using the "t4" span.
pat = r'<div class="el">(?:.*?)<span class="t4">(.*?)</span>'
salary = re.findall(pat, html, re.S)
print(salary)
print(len(salary))

# The four lists become four rows; transposing turns them into columns.
# NOTE(review): each list is matched independently, so rows only line up if
# all four patterns find the same number of items - compare the printed
# lengths above before trusting the table.
jobinfo = DataFrame([position, company, addrs, salary]).T
jobinfo.columns = ["职位", "公司", "地址", "工资"]
# Bare expression: presumably displays the table when run as a notebook cell.
jobinfo
image.png
image.png
image.png
爬取多页信息
# Crawl several pages of 51job search results and combine them into one table.
import re

import pandas as pd
import requests
from pandas import DataFrame

# The patterns do not depend on the page, so define them once outside the loop.
# Job title: the title attribute of each posting link.
position_pat = r'<a target="_blank" title="(.*?)" href="(?:.*?)" onmousedown="(?:.*)">'
# Company: the title attribute of the link inside the "t2" span.
company_pat = r'<span class="t2"><a target="_blank" title="(.*)" href=".*">.*</a></span>'
# Location / salary: first "t3" / "t4" span after each <div class="el">.
# These two are used with re.S so the lazy prefix can cross line breaks.
addrs_pat = r'<div class="el">(?:.*?)<span class="t3">(.*?)</span>'
salary_pat = r'<div class="el">(?:.*?)<span class="t4">(.*?)</span>'

# Collect one frame per page and concatenate once at the end; calling
# pd.concat inside the loop re-copies the accumulated rows on every page.
frames = []
for i in range(1, 10):  # pages 1 through 9
    # The page number is the last path component before ".html".
    url = "https://search.51job.com/list/010000,000000,0000,00,9,99,Java%2520%25E5%25BC%2580%25E5%258F%2591,2," + str(i) + ".html"
    # Without a timeout, requests can wait indefinitely on an unresponsive server.
    res = requests.get(url, timeout=10)
    # Fail loudly on an HTTP error instead of parsing an error page.
    res.raise_for_status()
    res.encoding = 'gbk'  # decode the response body as GBK
    html = res.text  # decode once and reuse for every pattern

    position = re.findall(position_pat, html)
    company = re.findall(company_pat, html)
    addrs = re.findall(addrs_pat, html, re.S)
    salary = re.findall(salary_pat, html, re.S)

    # The four lists become four rows; transposing turns them into columns.
    # NOTE(review): each list is matched independently, so rows only line up
    # if all four patterns find the same number of items on the page.
    jobinfo = DataFrame([position, company, addrs, salary]).T
    jobinfo.columns = ["职位", "公司", "地址", "工资"]
    frames.append(jobinfo)

# Each page keeps its own 0-based row index, as in the incremental version.
jobinfoAll = pd.concat(frames)
# Bare expression: presumably displays the table when run as a notebook cell.
jobinfoAll
image.png
image.png