
Python in Practice

2018-06-07  任嘉平生愿

The code for this chapter is collected at:

GitHub - jiajia154569836/Python: Python practice notes

Notes:

1. Install Python first.

2. Install the dependencies, e.g. python -m pip install requests (a one-shot install for everything used in this article is shown after the pitfalls below).

3. Pitfalls I hit along the way:

Installing Scrapy on Win10 with Python 3.5 - python菜鸟 - 博客园

The "no module named win32api" error when writing a Scrapy spider - 不活在梦想里 - 博客园
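
For convenience, the third-party packages used below can be installed in one go. This is my own consolidated command, not something the original notes list; it assumes pip is available on your PATH (beautifulsoup4 is the PyPI name for bs4, and lxml is the parser BeautifulSoup is asked for):

python -m pip install requests beautifulsoup4 lxml pymongo scrapy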

0. Bonus

Drawing Peppa Pig with Python (the built-in turtle library) - CSDN博客

1. Scraping the Kugou TOP500

Approach:

1. Fetch the HTML with requests.

2. Parse the HTML with BeautifulSoup.

3. Locate the CSS classes of the target text and pull it out with selectors.

4. Store the results in MongoDB; the time.sleep call slows the crawl down so that inserts don't fall behind the requests.

Since I don't have MongoDB installed, the storage lines are commented out (the data is printed to the console instead).

The code is as follows:

import time
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

# client = MongoClient()         # MongoDB server
# songs = client.kugou_db.songs  # song collection

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}

def get_info(url):
    '''Fetch one page of the Kugou Music TOP500.'''
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    ranks = soup.select('.pc_temp_num')                      # rank numbers
    titles = soup.select('.pc_temp_songlist > ul > li > a')  # "singer - song" titles
    song_times = soup.select('.pc_temp_time')                # track durations
    for rank, title, song_time in zip(ranks, titles, song_times):
        data = {
            'rank': rank.get_text().strip(),
            'singer': title.get_text().split('-')[0].strip(),
            'song': title.get_text().split('-')[1].strip(),
            'time': song_time.get_text().strip()
        }
        print(data)
        # song_id = songs.insert(data)  # insert into MongoDB
        # print(song_id)
        print('---------------------------------')

if __name__ == '__main__':
    # Build the URLs to crawl: the TOP500 spans 23 pages
    urls = ['http://www.kugou.com/yy/rank/home/{}-8888.html'.format(str(i))
            for i in range(1, 24)]
    for url in urls:
        get_info(url)
        time.sleep(1)  # throttle so inserts keep pace with requests
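
One caveat: title.get_text().split('-') treats every hyphen as a split point, so a song name that itself contains a hyphen gets truncated. A small variant of my own (not in the original code) is to split only on the first hyphen:

def split_title(text):
    # Split 'singer - song' at the first hyphen only, so hyphenated
    # song names stay intact
    singer, song = text.split('-', 1)
    return singer.strip(), song.strip()

print(split_title('Singer - My Song - Extended Mix'))
# ('Singer', 'My Song - Extended Mix')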

2. Scraping Lagou job listings


import json
import math
import time
import pymongo
import requests

# client = pymongo.MongoClient('localhost', 27017)
# mydb = client['mydb']
# lagou = mydb['lagou']

headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'Content-Length': '26',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cookie': 'xxxxxxxxxxxxxxxxx',
    'Host': 'www.lagou.com',
    'Origin': 'https://www.lagou.com',
    'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'X-Anit-Forge-Code': '0',
    'X-Anit-Forge-Token': 'None',
    'X-Requested-With': 'XMLHttpRequest'
}

def get_page(url, params):
    # First request: read totalCount to work out how many pages exist
    html = requests.post(url, data=params, headers=headers)
    json_data = json.loads(html.text)
    total_count = json_data['content']['positionResult']['totalCount']
    # 15 results per page, capped at 30 pages
    page_number = math.ceil(total_count / 15) if math.ceil(total_count / 15) < 30 else 30
    get_info(url, page_number)

def get_info(url, page):
    for pn in range(1, page + 1):
        params = {
            'first': 'false',
            'pn': str(pn),
            'kd': 'Python'
        }
        try:
            html = requests.post(url, data=params, headers=headers)
            json_data = json.loads(html.text)
            results = json_data['content']['positionResult']['result']
            for result in results:
                infos = {
                    'businessZones': result['businessZones'],
                    'city': result['city'],
                    'companyFullName': result['companyFullName'],
                    'companyLabelList': result['companyLabelList'],
                    'companySize': result['companySize'],
                    'district': result['district'],
                    'education': result['education'],
                    'explain': result['explain'],
                    'financeStage': result['financeStage'],
                    'firstType': result['firstType'],
                    'formatCreateTime': result['formatCreateTime'],
                    'gradeDescription': result['gradeDescription'],
                    'imState': result['imState'],
                    'industryField': result['industryField'],
                    'jobNature': result['jobNature'],
                    'positionAdvantage': result['positionAdvantage'],
                    'salary': result['salary'],
                    'secondType': result['secondType'],
                    'workYear': result['workYear']
                }
                print('------------------')
                print(infos)
                # lagou.insert_one(infos)
            time.sleep(2)  # throttle between pages
        except requests.exceptions.ConnectionError:
            pass

if __name__ == "__main__":
    url = 'https://www.lagou.com/jobs/positionAjax.json'
    params = {
        'first': 'true',
        'pn': '1',
        'kd': 'python'
    }
    get_page(url, params)
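
Both scripts leave the MongoDB writes commented out. For completeness, a minimal sketch of what enabling them could look like, assuming a local mongod running on the default port 27017 (the database and collection names mirror the commented-out lines above; the save helper is mine, not the article's):

import pymongo

client = pymongo.MongoClient('localhost', 27017)
lagou = client['mydb']['lagou']

def save(infos):
    # insert_one() is the current PyMongo API; the bare insert() shown
    # in section 1's comments is deprecated in modern PyMongo
    lagou.insert_one(infos)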

3. Scraping Taobao product listings

4. Crawling the Xiaozhu short-term rental site with Scrapy

scrapy startproject new   # create the project

scrapy crawl new          # run the spider from inside the project directory

See the Git repo (linked at the top of this article) for the code.
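
The repo holds the full project; purely as an illustration of the shape of such a spider, here is a minimal sketch (the spider name, start URL, and CSS selectors are hypothetical placeholders, not taken from the repo):

import scrapy

class XiaozhuSpider(scrapy.Spider):
    # Hypothetical minimal spider; run with: scrapy crawl xiaozhu
    name = 'xiaozhu'
    start_urls = ['http://bj.xiaozhu.com/']  # placeholder start page

    def parse(self, response):
        # Selectors below are illustrative placeholders
        for listing in response.css('#page_list ul li'):
            yield {
                'title': listing.css('span.result_title::text').get(),
                'price': listing.css('span.result_price i::text').get(),
            }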
