Python爬取招聘

2018-12-17  本文已影响0人  开心的小哈
from selenium import webdriver
import re


def geturl(urlname):
    """Return the number of 51job postings listed for a search keyword.

    Opens the 51job search-result page for ``urlname`` in a Selenium-driven
    Firefox window and extracts the count from the "共N条" text in the page
    source.

    Args:
        urlname: Search keyword; inserted verbatim into the URL path.

    Returns:
        The count as a string of digits, or "失败" when the page source
        contains no "共N条" text.
    """
    url = (
        "https://search.51job.com/list/010000,000000,0000,00,9,99,"
        + urlname
        + ",2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99"
        "&degreefrom=99&jobterm=99&companysize=99&providesalary=99"
        "&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType="
        "&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
    )
    brower = webdriver.Firefox()
    try:
        brower.get(url)
        page_source = brower.page_source
    finally:
        # Always shut the browser down, even when loading the page fails.
        brower.quit()

    # Only the parenthesised group (the digits) is returned by findall.
    matches = re.findall(r"共(\d+)条", page_source)
    # Check for "no match" before indexing; indexing first raised IndexError
    # and made the failure branch unreachable.
    if not matches:
        return "失败"
    return matches[0]

# Single-keyword example: print(geturl("python"))
androids = [
    "android开发",
    "安卓开发",
    "Android",
    "Android实习生",
    "软件测试",
]
# Report the posting count for each search keyword in turn.
for ands in androids:
    print(ands, geturl(ands))

urllib.request

import urllib.request
import urllib.error
def download(url):
    """Fetch ``url`` and print the response type, headers and raw body.

    Args:
        url: Address to open; passed straight to ``urllib.request.urlopen``
            with a 5-second timeout.

    Returns:
        None. All output goes to standard output. Exceptions raised while
        opening or reading the URL are not caught here.
    """
    # The with-block closes the response even if one of the prints fails.
    with urllib.request.urlopen(url, timeout=5) as response:
        print(type(response))  # e.g. <class 'http.client.HTTPResponse'>
        print(response.info())  # response headers
        print(response.read())  # raw body bytes; read(n) would limit the size
try:
    # download() prints its own output and returns nothing, so wrapping the
    # call in print() only added a stray "None" line.
    download("http://www.google.com")
except urllib.error.URLError as e:  # bind the caught error so it can be shown
    print("网络异常", e)

伪装

import urllib.request


def openUrl(url):
    """Fetch ``url`` with a browser-like User-Agent and print the decoded page.

    Some sites reject requests that do not look like they come from a
    browser, so a desktop Chrome User-Agent header is sent.

    Args:
        url: Address of the page to fetch.

    Returns:
        None. The decoded page text is written to standard output.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/51.0.2704.63 Safari/537.36'}
    req = urllib.request.Request(url, headers=headers)
    # The with-block makes sure the connection is released after reading.
    with urllib.request.urlopen(req) as response:
        raw = response.read()
        # Use the charset the server declares; fall back to UTF-8 when the
        # Content-Type header does not name one.
        charset = response.headers.get_content_charset() or "utf-8"
    print(raw.decode(charset))

def openUrl2(url):
    """Return the complete raw body of ``url`` as bytes.

    Args:
        url: Address of the page to fetch.

    Returns:
        The undecoded response body.
    """
    # Close the response once the body has been read instead of leaking it.
    with urllib.request.urlopen(url) as response:
        return response.read()

if __name__ == "__main__":
    # Alternative target for experimenting: 'http://www.douban.com/'
    url = "http://www.bazhuayu.com/download"
    # First show the raw bytes from the plain request ...
    raw_page = openUrl2(url)
    print(raw_page)
    # ... then the decoded text from the request sent with a browser User-Agent.
    openUrl(url)
20180715201515845.png

还可以冒充手机浏览器等
如果网站把我们屏蔽了,可以通过伪装请求头或调用浏览器这两种方式解决。
有的网址需要传入中文参数(比如百度搜索),所以在这里我们要对中文进行编码和解码。

import urllib.parse
import urllib.request

# Non-ASCII query text must be percent-encoded before it is placed in a URL.
keyword = urllib.parse.quote("爬虫")
url = "http://zzk.cnblogs.com/s?w=python" + keyword + "&t=b"

print(keyword)  # the percent-encoded form
print(urllib.parse.unquote(keyword))  # decoded back to the original text
# Fetch the search page and print it as text; the with-block closes the
# response when done.
with urllib.request.urlopen(url) as response:
    print(response.read().decode("UTF-8"))
上一篇 下一篇

猜你喜欢

热点阅读