爬虫boss直聘

2019-01-25  本文已影响0人  christinazou

使用的库,参考
Requests库的用法
xpath
csv

from lxml import etree
import requests
import time
import csv

在网页中找到需要的信息
首先，登录boss直聘网，搜索"数据分析"。打开网页检查器并刷新，找到网页的标头（header）信息。


req_url根据页数有相应的变化,headers 把网页中的信息直接拷贝下来就可以了。得到request 返回的结果

 def gethtml(i):
    # requests 相关数据
    req_url =( 'https://www.zhipin.com/c100010000/?query=数据分析&page=%s&ka=page-%s' %(i,i))
    headers = {
    "Cookie": "Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1548331018; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1547734371,1547971749,1548077779,1548208490; __a=4592978.1541328694.1548077779.1548208490.873.10.12.457; __l=l=%2Fwww.zhipin.com%2F&r=; JSESSIONID=""; toUrl=http%3A%2F%2Fwww.zhipin.com%2Fc100010000%2F%3Fquery%3D%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%26ka%3Dsel-city-100010000; lastCity=101010100; __c=1548208490; __g=-; _uab_collina=154169007228349597439633",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Host": "www.zhipin.com",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15",
    "Accept-Language": "zh-cn",
    "Accept-Encoding": "br, gzip, deflate",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Connection": "keep-alive"
}

用xpath解析html需要先将html保存在etree中

# Return the DOM node set of job postings on one listing page.
def getcompanys(req_result):
    """Parse a listing-page response and return its job-card elements.

    Parameters
    ----------
    req_result : requests.Response
        Response whose ``.text`` holds the listing HTML.

    Returns
    -------
    list
        lxml element nodes, one per ``<div class="job-primary">`` card.
    """
    tree = etree.HTML(req_result.text)
    return tree.xpath("//div[@class='job-primary']")

# Return a dict of detail fields for one job posting.
def getdetails(companys):
    """Extract the fields of one job card into a flat dict.

    Parameters
    ----------
    companys : lxml element
        Despite the plural name (kept for backward compatibility), this is
        a SINGLE job-card node — one element of the list returned by
        ``getcompanys``.

    Returns
    -------
    dict
        String values keyed by the CSV column names used in the writer.
    """
    # Bug fix: the original body read the *global* name `c` (leaked from
    # the calling loop) instead of its own parameter; bind it explicitly.
    c = companys
    # Query each repeated node set once instead of once per field.
    primary = c.xpath(".//p/text()")
    company = c.xpath(".//div[@class='info-company']//p/text()")
    job_ids = c.xpath(".//div[@class='info-primary']//a/@data-jobid")
    return {
        # Bug fix: take the first id string; the original stored the raw
        # list, which serialized as "['123']" in the CSV.
        'positionId': job_ids[0] if job_ids else '',
        'positionName': c.xpath(".//h3[@class='name']//div[@class='job-title']/text()")[0],
        'salary': c.xpath(".//h3[@class='name']//span/text()")[0],
        'city': primary[0],
        'experience': primary[1],
        'education': primary[2],
        'companyShortName': c.xpath(".//div[@class='info-company']//a/text()")[0],
        'industryField': company[0],
        'financeStage': company[1],
        'companySize': company[2],
    }

保存在csv文件中

# Save the scraped postings to a CSV file.
# NOTE(review): the loop variable below MUST stay named `c` — getdetails'
# body references the global name `c` rather than its parameter, so renaming
# it here would raise NameError inside getdetails.
with open('data.csv', 'w', newline='', encoding='utf-8') as csvfile:
    # Column order of the output file; must match the keys of the dict
    # returned by getdetails.
    fieldnames = ['positionId','positionName', 'salary','city','experience', 'education','companyShortName','industryField','financeStage','companySize']
    csvobj = csv.DictWriter(csvfile, fieldnames=fieldnames)
    csvobj.writeheader()

    # Scrape result pages 1 through 9 (range upper bound is exclusive).
    for i in range(1,10):
        req_result = gethtml(i)
        companys = getcompanys(req_result)
        for c in companys:
            details = getdetails(c)
            # Write one row per job posting.
            csvobj.writerow(details)

得到的文件:


上一篇下一篇

猜你喜欢

热点阅读