Web Scraping: Crawling 58同城

2016-08-04  泠泠七弦客

I originally wanted to scrape some JS-loaded data (the view count), but this page has since been redesigned and the view count is now rendered directly in the HTML. The condition (成色) and posting-date fields are gone entirely, so this crawler follows essentially the same routine as before.
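
By the way, a quick way to check whether a field is server-rendered or injected by JavaScript afterwards is to fetch the raw HTML with requests and search for its marker class; a minimal sketch (the listing URL below is a made-up example, substitute any live detail page):

import requests

# Hypothetical detail-page URL, for illustration only.
url = 'http://bj.58.com/pbdn/12345678x.shtml'
html = requests.get(url).text

# 'look_time' is the class on the view-count <span> in the page source;
# if it appears in the raw HTML, the value is server-rendered and plain
# requests (no JS engine) is enough.
print('look_time' in html)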

Result

The code:
from bs4 import BeautifulSoup
import requests
import json
import time

User_Agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'
headers = {
    'User-Agent': User_Agent,
}


# The posting-date and condition fields no longer exist on the page
def request_details(url):
    f = requests.get(url, headers=headers)
    soup = BeautifulSoup(f.text, 'lxml')
    category = soup.select('div.breadCrumb.f12 > span:nth-of-type(4)')
    title = soup.select('h1.info_titile')
    price = soup.select('span.price_now > i')
    area = soup.select('div.palce_li > span > i')
    view_num = soup.select('span.look_time')

    data = {
        'title': title[0].text,
        'category': category[0].text.strip(),
        'price': price[0].text,
        'area': area[0].text,
        'view_num': view_num[0].text
    }
    return data


def get_link(url):
    f = requests.get(url, headers=headers)
    soup = BeautifulSoup(f.text, 'lxml')
    # class selector: rows on the listing page carry class 'zzinfo',
    # so the dot is required for select() to match anything
    links = soup.select('.zzinfo > td.img > a')
    link_list = []
    for link in links:
        link_content = link.get('href')
        link_list.append(link_content)
    return link_list


def save_to_text(content):
    content = json.dumps(content, ensure_ascii=False)
    with open('58', 'a', encoding='utf-8') as f:
        f.write(content)
        f.write('\r\n')


def main():
    base_url = 'http://bj.58.com/pbdn/0/pn{}'
    start = 1
    end = 10
    # listing pages pn1 through pn9; raise end to crawl more pages
    urls = [base_url.format(i) for i in range(start, end)]
    for url in urls:
        link_list = get_link(url)
        for link in link_list:
            content = request_details(link)
            time.sleep(1)  # throttle to one detail request per second
            print(content)
            save_to_text(content)


if __name__ == '__main__':
    main()

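One caveat worth noting: request_details assumes every selector matches, so if a listing page is missing a field, title[0] raises an IndexError and the whole crawl stops. A minimal guard, sketched here as a hypothetical wrapper (not part of the script above), could look like:

def safe_request_details(url):
    # Hypothetical wrapper around request_details: returns None instead
    # of crashing when a page lacks one of the expected fields.
    try:
        return request_details(url)
    except IndexError:
        return None

In main(), skipping None results would then keep one dead listing from killing the run.
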
This is basically the same routine as the earlier 小猪短租 scraper, so there is not much new to summarize; I will just mention the one problem I ran into, which was in save_to_text. By default, json.dumps escapes every non-ASCII character, so the Chinese fields would be written out as unreadable \uXXXX sequences. Passing ensure_ascii=False keeps the characters as-is, which in turn means the output file must be opened with an explicit encoding='utf-8':

def save_to_text(content):
    content = json.dumps(content, ensure_ascii=False)
    with open('58', 'a', encoding='utf-8') as f:
        f.write(content)
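
To see the difference concretely, here is a small sketch (the sample dict is made up purely for illustration):

import json

# Made-up item for illustration; the scraped dicts behave the same way.
item = {'title': '平板电脑', 'price': '1500'}

print(json.dumps(item))                      # {"title": "\u5e73\u677f\u7535\u8111", "price": "1500"}
print(json.dumps(item, ensure_ascii=False))  # {"title": "平板电脑", "price": "1500"}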

That's all.
