BeautifulSoup4 Crawler Practice
2017-04-17
Yuu_CX
Scraping Douban Books. I'm not sure whether it was an anti-crawler measure, but I could only get to about page 50 before my IP was banned.
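One common mitigation (a minimal sketch of my own, not the method used in this post) is to wrap requests.get in a small helper that waits a randomized interval and raises on a blocked response, so a ban shows up immediately instead of producing empty pages; the name polite_get is made up here:

import random
import time
import requests

def polite_get(url, headers, min_delay=1.0, max_delay=3.0):
    # wait a random interval before each request so the traffic looks less mechanical
    time.sleep(random.uniform(min_delay, max_delay))
    resp = requests.get(url, headers=headers, timeout=10)
    # 403/429/5xx responses raise here instead of being parsed as an (empty) page
    resp.raise_for_status()
    return resp

The original script below does not use this helper; it calls requests.get directly.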
import pymysql
import requests
from bs4 import BeautifulSoup
import time
# %d is a numeric placeholder for the page offset
baseUrl = "https://book.douban.com/tag/日本文学?start=%d&type=T"
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36","Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}
def get_books(start):
    url = baseUrl % start
    lists = []
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.content, "html.parser")  # parse the page with BeautifulSoup
    items = soup.find("ul", "subject-list").find_all("li")  # all book entries on this page
    for i in items:
        books = {}  # temporary dict for one book
        books["name"] = i.find("div", "info").a['title']  # book title
        books["pub"] = i.find("div", "info").find("div", "pub").text  # publication info
        books["score"] = i.find("span", "rating_nums").text if i.find("span", "rating_nums") else ""  # rating
        books["comment_num"] = i.find("span", "pl").text  # number of ratings
        books["detail"] = i.find("p", "").text if i.find("p", "") else ""  # short description
        books["link"] = i.find("div", "pic").find("a").get("href")  # link to the book's detail page
        books["poster"] = i.find("div", "pic").find("a").find('img').get("src")  # cover image URL
        lists.append(books)  # collect into the result list
    return lists
if __name__ == "__main__":
    # connect to MySQL; specify charset, otherwise inserts may fail with encoding errors
    db = pymysql.connect(host="localhost", user="root", password="root", db="new_schema", charset="utf8mb4")
    cursor = db.cursor()
    cursor.execute("DROP TABLE IF EXISTS douban_books")  # drop the table if it already exists
    # SQL statement to create the table
    createTab = """CREATE TABLE douban_books(
        id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(100) NOT NULL,
        pub VARCHAR(100) NOT NULL,
        score VARCHAR(100) NOT NULL,
        comment_num VARCHAR(100) NOT NULL,
        detail VARCHAR(300) NOT NULL,
        link VARCHAR(50) NOT NULL,
        poster VARCHAR(100) NOT NULL
    )"""
    cursor.execute(createTab)
    for start in range(0, 1000, 20):
        lists = get_books(start)  # fetch and parse one page of results
        for i in lists:
            # insert one row; %s is the driver's string placeholder
            sql = "INSERT INTO `douban_books`(`name`,`pub`,`score`,`comment_num`,`detail`,`link`,`poster`) VALUES(%s,%s,%s,%s,%s,%s,%s)"
            try:
                cursor.execute(sql, (i["name"], i["pub"], i["score"], i["comment_num"], i["detail"], i["link"], i["poster"]))
                db.commit()
                print(i["name"] + " is success")
            except:
                db.rollback()  # roll back the failed insert and keep going
        time.sleep(0.5)
    db.close()
[Figure: the scraped results imported into MySQL]
Now for the key part:
An improved approach, which safely scraped about 45,000 Douban book records.
import pymysql
import requests
from bs4 import BeautifulSoup
import time
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36","Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}
def get_books(start):
    # %d is a numeric placeholder for the page offset
    # single-tag version, kept for reference:
    # baseUrl = "https://book.douban.com/tag/旅行?start=%d&type=T"
    url_list = ["https://book.douban.com/tag/绘本?start=%d&type=T","https://book.douban.com/tag/推理?start=%d&type=T","https://book.douban.com/tag/青春?start=%d&type=T","https://book.douban.com/tag/言情?start=%d&type=T","https://book.douban.com/tag/科幻?start=%d&type=T","https://book.douban.com/tag/武侠?start=%d&type=T","https://book.douban.com/tag/奇幻?start=%d&type=T","https://book.douban.com/tag/随笔?start=%d&type=T","https://book.douban.com/tag/散文?start=%d&type=T","https://book.douban.com/tag/诗歌?start=%d&type=T","https://book.douban.com/tag/童话?start=%d&type=T",
                "https://book.douban.com/tag/名著?start=%d&type=T","https://book.douban.com/tag/小说?start=%d&type=T","https://book.douban.com/tag/港台?start=%d&type=T","https://book.douban.com/tag/漫画?start=%d&type=T","https://book.douban.com/tag/历史?start=%d&type=T","https://book.douban.com/tag/哲学?start=%d&type=T","https://book.douban.com/tag/传记?start=%d&type=T","https://book.douban.com/tag/设计?start=%d&type=T","https://book.douban.com/tag/建筑?start=%d&type=T","https://book.douban.com/tag/电影?start=%d&type=T","https://book.douban.com/tag/回忆录?start=%d&type=T","https://book.douban.com/tag/音乐?start=%d&type=T","https://book.douban.com/tag/日本文学?start=%d&type=T",
                "https://book.douban.com/tag/旅行?start=%d&type=T","https://book.douban.com/tag/励志?start=%d&type=T","https://book.douban.com/tag/职场?start=%d&type=T","https://book.douban.com/tag/美食?start=%d&type=T","https://book.douban.com/tag/教育?start=%d&type=T","https://book.douban.com/tag/灵修?start=%d&type=T","https://book.douban.com/tag/健康?start=%d&type=T","https://book.douban.com/tag/家居?start=%d&type=T","https://book.douban.com/tag/经济学?start=%d&type=T","https://book.douban.com/tag/管理?start=%d&type=T","https://book.douban.com/tag/商业?start=%d&type=T","https://book.douban.com/tag/金融?start=%d&type=T","https://book.douban.com/tag/营销?start=%d&type=T",
                "https://book.douban.com/tag/理财?start=%d&type=T","https://book.douban.com/tag/股票?start=%d&type=T","https://book.douban.com/tag/企业史?start=%d&type=T","https://book.douban.com/tag/科普?start=%d&type=T","https://book.douban.com/tag/互联网?start=%d&type=T","https://book.douban.com/tag/编程?start=%d&type=T","https://book.douban.com/tag/交互设计?start=%d&type=T","https://book.douban.com/tag/算法?start=%d&type=T","https://book.douban.com/tag/通信?start=%d&type=T","https://book.douban.com/tag/神经网络?start=%d&type=T"]
    lists = []
    for baseUrl in url_list:
        url = baseUrl % start
        html = requests.get(url, headers=headers)
        soup = BeautifulSoup(html.content, "html.parser")  # parse the page with BeautifulSoup
        items = soup.find("ul", "subject-list").find_all("li")  # all book entries on this page
        for i in items:
            books = {}  # temporary dict for one book
            books["name"] = i.find("div", "info").a['title']  # book title
            books["pub"] = i.find("div", "info").find("div", "pub").text  # publication info
            books["score"] = i.find("span", "rating_nums").text if i.find("span", "rating_nums") else ""  # rating
            books["comment_num"] = i.find("span", "pl").text  # number of ratings
            books["detail"] = i.find("p", "").text if i.find("p", "") else ""  # short description
            books["link"] = i.find("div", "pic").find("a").get("href")  # link to the book's detail page
            books["poster"] = i.find("div", "pic").find("a").find('img').get("src")  # cover image URL
            lists.append(books)  # collect into the result list
        time.sleep(3.5)  # pause between tags to avoid getting banned
    return lists
if __name__ == "__main__":
    # connect to MySQL; specify charset, otherwise inserts may fail with encoding errors
    db = pymysql.connect(host="localhost", user="root", password="root", db="new_schema", charset="utf8mb4")
    cursor = db.cursor()
    cursor.execute("DROP TABLE IF EXISTS douban_books")  # drop the table if it already exists
    # SQL statement to create the table
    createTab = """CREATE TABLE douban_books(
        id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(100) NOT NULL,
        pub VARCHAR(100) NOT NULL,
        score VARCHAR(100) NOT NULL,
        comment_num VARCHAR(100) NOT NULL,
        detail VARCHAR(300) NOT NULL,
        link VARCHAR(50) NOT NULL,
        poster VARCHAR(100) NOT NULL
    )"""
    cursor.execute(createTab)
    a = 0  # running count of inserted rows
    for start in range(0, 1000, 20):
        lists = get_books(start)  # fetch and parse one page for every tag
        print(start)
        for i in lists:
            # insert one row; %s is the driver's string placeholder
            sql = "INSERT INTO `douban_books`(`name`,`pub`,`score`,`comment_num`,`detail`,`link`,`poster`) VALUES(%s,%s,%s,%s,%s,%s,%s)"
            try:
                cursor.execute(sql, (i["name"], i["pub"], i["score"], i["comment_num"], i["detail"], i["link"], i["poster"]))
                db.commit()
                print(i["name"] + " is success")
                a += 1
                print(a)
            except:
                db.rollback()  # roll back the failed insert and keep going
        time.sleep(3.5)
    db.close()
Building on the above, scrape every book under Douban's popular tags, 120 tags in total:
import pymysql
import requests
from bs4 import BeautifulSoup
import time
tagUrl = "https://book.douban.com/tag/?view=cloud"
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36","Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}
def link_title():
    url = tagUrl
    lists = []
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.content, "html.parser")
    items = soup.find("table", "tagCol").find_all("tr")
    for i in items:
        lines = i.find_all("td")
        for j in lines:
            books = {}
            books["title"] = j.a.string  # tag name
            books["num"] = j.b.string    # number of books under the tag
            lists.append(books["title"])  # build the tag list: 120 popular tags in total
    return lists

def get_books(start):
    # %d is a numeric placeholder for the page offset
    url = "https://book.douban.com/tag/待选?start=%d&type=T"
    url_list = link_title()
    lists = []
    for t in url_list:
        tag = t
        baseUrl = url.replace('待选', t)  # substitute the placeholder tag with the real one
        full_url = baseUrl % start
        html = requests.get(full_url, headers=headers)
        soup = BeautifulSoup(html.content, "html.parser")  # parse the page with BeautifulSoup
        items = soup.find("ul", "subject-list").find_all("li")  # all book entries on this page
        for i in items:
            books = {}  # temporary dict for one book
            books["tag"] = tag  # the tag this book was found under
            books["name"] = i.find("div", "info").a['title']  # book title
            books["pub"] = i.find("div", "info").find("div", "pub").text  # publication info
            books["score"] = i.find("span", "rating_nums").text if i.find("span", "rating_nums") else ""  # rating
            books["comment_num"] = i.find("span", "pl").text  # number of ratings
            books["detail"] = i.find("p", "").text if i.find("p", "") else ""  # short description
            books["link"] = i.find("div", "pic").find("a").get("href")  # link to the book's detail page
            books["poster"] = i.find("div", "pic").find("a").find('img').get("src")  # cover image URL
            lists.append(books)  # collect into the result list
        time.sleep(3.5)  # pause between tags to avoid getting banned
    return lists
if __name__ == "__main__":
    # connect to MySQL; specify charset, otherwise inserts may fail with encoding errors
    db = pymysql.connect(host="localhost", user="root", password="root", db="new_schema", charset="utf8mb4")
    cursor = db.cursor()
    cursor.execute("DROP TABLE IF EXISTS douban_books_alltags")  # drop the table if it already exists
    # SQL statement to create the table
    createTab = """CREATE TABLE douban_books_alltags(
        id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
        tag VARCHAR(100) NOT NULL,
        name VARCHAR(100) NOT NULL,
        pub VARCHAR(100) NOT NULL,
        score VARCHAR(100) NOT NULL,
        comment_num VARCHAR(100) NOT NULL,
        detail VARCHAR(300) NOT NULL,
        link VARCHAR(50) NOT NULL,
        poster VARCHAR(100) NOT NULL
    )"""
    cursor.execute(createTab)
    a = 0  # running count of inserted rows
    for start in range(0, 1000, 20):
        lists = get_books(start)  # fetch and parse one page for every tag
        print(start)
        for i in lists:
            # insert one row; %s is the driver's string placeholder
            sql = "INSERT INTO `douban_books_alltags`(`tag`,`name`,`pub`,`score`,`comment_num`,`detail`,`link`,`poster`) VALUES(%s,%s,%s,%s,%s,%s,%s,%s)"
            try:
                cursor.execute(sql, (i["tag"], i["name"], i["pub"], i["score"], i["comment_num"], i["detail"], i["link"], i["poster"]))
                db.commit()
                print(i["name"] + " is success")
                a += 1
                print(a)
            except:
                db.rollback()  # roll back the failed insert and keep going
        time.sleep(3.5)
    db.close()
Scraping the Douban Movie Top 250 (adapted from a reference):
import pymysql
import requests
from bs4 import BeautifulSoup
# %d is a numeric placeholder for the page offset
baseUrl = "https://movie.douban.com/top250?start=%d&filter="
def get_movies(start):
    url = baseUrl % start
    lists = []
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")  # parse the page with BeautifulSoup
    items = soup.find("ol", "grid_view").find_all("li")  # all movie entries on this page
    for i in items:
        movie = {}  # temporary dict for one movie
        movie["rank"] = i.find("em").text  # ranking position
        movie["link"] = i.find("div", "pic").find("a").get("href")  # link to the movie's detail page
        movie["poster"] = i.find("div", "pic").find("a").find('img').get("src")  # poster image URL
        movie["name"] = i.find("span", "title").text  # movie title
        movie["score"] = i.find("span", "rating_num").text  # rating
        movie["other"] = i.find("span", "other").text.replace('/', '').replace(' ', '/')  # alternative titles
        movie["quote"] = i.find("span", "inq").text if i.find("span", "inq") else ""  # tagline; some movies have none, so default to empty
        movie["comment_num"] = i.find("div", "star").find_all('span')[3].text  # number of ratings
        movie["detail"] = i.find("div", "bd").find("p", "").text  # detail line: director, cast, year, country, genre
        lists.append(movie)  # collect into the result list
    return lists
if __name__ == "__main__":
    # connect to MySQL; specify charset, otherwise inserts may fail with encoding errors
    db = pymysql.connect(host="localhost", user="root", password="root", db="new_schema", charset="utf8mb4")
    cursor = db.cursor()
    cursor.execute("DROP TABLE IF EXISTS movies")  # drop the table if it already exists
    # SQL statement to create the table (`rank` is quoted because it is a reserved word in newer MySQL versions)
    createTab = """CREATE TABLE movies(
        id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(20) NOT NULL,
        `rank` VARCHAR(4) NOT NULL,
        link VARCHAR(50) NOT NULL,
        poster VARCHAR(100) NOT NULL,
        score VARCHAR(4) NOT NULL,
        other VARCHAR(100) NOT NULL,
        quote VARCHAR(50),
        detail VARCHAR(300) NOT NULL,
        comment_num VARCHAR(100) NOT NULL
    )"""
    cursor.execute(createTab)
    for start in range(0, 250, 25):
        lists = get_movies(start)  # fetch and parse one page of 25 movies
        for i in lists:
            # insert one row; %s is the driver's string placeholder
            sql = "INSERT INTO `movies`(`name`,`rank`,`link`,`poster`,`score`,`other`,`quote`,`detail`,`comment_num`) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            try:
                cursor.execute(sql, (i["name"], i["rank"], i["link"], i["poster"], i["score"], i["other"], i["quote"], i["detail"], i["comment_num"]))
                db.commit()
                print(i["name"] + " is success")
            except:
                db.rollback()  # roll back the failed insert and keep going
    db.close()
Split the scraped movie details into year, country/region, genre and so on, and write them to MySQL:
import pymysql
import requests
from bs4 import BeautifulSoup
import re
# %d is a numeric placeholder for the page offset
baseUrl = "https://movie.douban.com/top250?start=%d&filter="
def get_movies(start):
    url = baseUrl % start
    lists = []
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")  # parse the page with BeautifulSoup
    items = soup.find("ol", "grid_view").find_all("li")  # all movie entries on this page
    for i in items:
        movie = {}  # temporary dict for one movie
        movie["rank"] = i.find("em").text  # ranking position
        movie["link"] = i.find("div", "pic").find("a").get("href")  # link to the movie's detail page
        movie["poster"] = i.find("div", "pic").find("a").find('img').get("src")  # poster image URL
        movie["name"] = i.find("span", "title").text  # movie title
        movie["score"] = i.find("span", "rating_num").text  # rating
        movie["other"] = i.find("span", "other").text.replace('/', '').replace(' ', '/')  # alternative titles
        movie["quote"] = i.find("span", "inq").text if i.find("span", "inq") else ""  # tagline; some movies have none, so default to empty
        movie["comment_num"] = i.find("div", "star").find_all('span')[3].text  # number of ratings
        movie["detail"] = i.find("div", "bd").find("p", "").text  # detail line: director, cast, year, country, genre
        lists.append(movie)  # collect into the result list
    return lists
if __name__ == "__main__":
    # connect to MySQL; specify charset, otherwise inserts may fail with encoding errors
    db = pymysql.connect(host="localhost", user="root", password="root", db="new_schema", charset="utf8mb4")
    cursor = db.cursor()
    cursor.execute("DROP TABLE IF EXISTS movies")  # drop the table if it already exists
    # SQL statement to create the table (`rank` is quoted because it is a reserved word in newer MySQL versions)
    createTab = """CREATE TABLE movies(
        id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(20) NOT NULL,
        `rank` VARCHAR(4) NOT NULL,
        link VARCHAR(50) NOT NULL,
        poster VARCHAR(100) NOT NULL,
        score VARCHAR(4) NOT NULL,
        other VARCHAR(100) NOT NULL,
        quote VARCHAR(50),
        detail VARCHAR(300) NOT NULL,
        time VARCHAR(300) NOT NULL,
        country VARCHAR(300) NOT NULL,
        type VARCHAR(300) NOT NULL,
        drictor_artist VARCHAR(300) NOT NULL,
        comment_num VARCHAR(100) NOT NULL
    )"""
    cursor.execute(createTab)
    for start in range(0, 250, 25):
        lists = get_movies(start)  # fetch and parse one page of 25 movies
        for i in lists:
            data = []  # words parsed out of this movie's detail string (reset per movie, otherwise data[0] stays on the first movie)
            action = i["detail"]
            remove = re.compile(r' |\n|</br>|\.*')
            bd = re.sub(remove, "", action)
            bd = re.sub('<br>', " ", bd)  # drop <br> tags
            bd = re.sub('/', " ", bd)     # replace the / separators
            words = bd.split(" ")
            for s in words:
                if len(s) != 0 and s != ' ':  # drop empty fragments
                    data.append(s)
            i["time"] = data[-3][-5:]      # release year
            i["country"] = data[-2]        # country or region
            i["type"] = data[-1]           # genre
            i["drictor_artist"] = data[0]  # director / cast line
            # insert one row; %s is the driver's string placeholder
            sql = "INSERT INTO `movies`(`name`,`rank`,`link`,`poster`,`score`,`other`,`quote`,`detail`,`time`,`country`,`type`,`drictor_artist`,`comment_num`) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            try:
                cursor.execute(sql, (i["name"], i["rank"], i["link"], i["poster"], i["score"], i["other"], i["quote"], i["detail"], i["time"], i["country"], i["type"], i["drictor_artist"], i["comment_num"]))
                db.commit()
                print(i["name"] + " is success")
            except:
                db.rollback()  # roll back the failed insert and keep going
    db.close()
With this data you can plot the years of the Top 250 movies.
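As a rough illustration (my own sketch, not code from the original post), the year column written by the script above can be read back from MySQL and plotted with pandas and matplotlib; the table and column names match the script, everything else here is an assumption:

import pymysql
import pandas as pd
import matplotlib.pyplot as plt

# read the year column back out of the movies table created above
db = pymysql.connect(host="localhost", user="root", password="root", db="new_schema", charset="utf8mb4")
years = pd.read_sql("SELECT `time` FROM movies", db)["time"]
db.close()

# count movies per year and draw a simple bar chart
counts = years.value_counts().sort_index()
plt.bar(range(len(counts)), counts.values)
plt.xticks(range(len(counts)), counts.index, rotation=90, fontsize=7)
plt.xlabel("year")
plt.ylabel("number of movies")
plt.title("Douban Top 250 by year")
plt.show()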
[Figure: decade distribution of the Douban Movie Top 250]

Scrape Lianjia second-hand housing listings and write the data to MySQL:
import pymysql
import requests
from bs4 import BeautifulSoup
import time
# %d is a numeric placeholder for the page number
baseUrl = "http://gz.lianjia.com/ershoufang/pg%d/"
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36","Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}
def get_books(start):
    url = baseUrl % start
    lists = []
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.content, "html.parser")  # parse the page with BeautifulSoup
    items = soup.find("ul", "sellListContent").find_all("li")  # all listing entries on this page
    for i in items:
        books = {}  # temporary dict for one listing
        books["title"] = i.find("div", "title").a.string  # e.g. 雅逸庭3房出售 单边位 带主套 免交个税
        books["link"] = i.find("div", "title").a.get('href')  # e.g. http://gz.lianjia.com/ershoufang/GZ0002556701.html
        books["address_xiaoqu"] = i.find("div", "houseInfo").a.string  # e.g. 广州雅居乐花园雅逸庭
        books["address_info"] = i.find("div", "houseInfo").a.next_sibling  # e.g. | 3室2厅 | 109.5平米 | 东北 | 精装 | 无电梯
        books["flood"] = i.find("div", "flood").find("div", "positionInfo").span.next_sibling  # e.g. 中楼层(共6层)2010年建塔楼 -
        books["area"] = i.find("div", "flood").find("div", "positionInfo").a.string  # e.g. 华南
        books["total_price"] = i.find("div", "totalPrice").find("span").text  # total price
        books["mean_price"] = i.find("div", "unitPrice").find("span").text  # price per square metre
        books["followInfo"] = i.find("div", "followInfo").span.next_sibling  # e.g. 103人关注 / 共42次带看 / 29天以前发布
        lists.append(books)  # collect into the result list
    return lists
if __name__ == "__main__":
    # connect to MySQL; specify charset, otherwise inserts may fail with encoding errors
    db = pymysql.connect(host="localhost", user="root", password="root", db="new_schema", charset="utf8mb4")
    cursor = db.cursor()
    cursor.execute("DROP TABLE IF EXISTS douban_books")  # drop the table if it already exists (name reused from the book script)
    # SQL statement to create the table
    createTab = """CREATE TABLE douban_books(
        id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(100) NOT NULL,
        link VARCHAR(100) NOT NULL,
        address_xiaoqu VARCHAR(100) NOT NULL,
        address_info VARCHAR(100) NOT NULL,
        flood VARCHAR(100) NOT NULL,
        area VARCHAR(300) NOT NULL,
        total_price VARCHAR(50) NOT NULL,
        mean_price VARCHAR(100) NOT NULL,
        followInfo VARCHAR(200) NOT NULL
    )"""
    cursor.execute(createTab)
    for start in range(1, 100, 1):
        lists = get_books(start)  # fetch and parse one page of listings
        for i in lists:
            # insert one row; %s is the driver's string placeholder
            sql = "INSERT INTO `douban_books`(`title`,`link`,`address_xiaoqu`,`address_info`,`flood`,`area`,`total_price`,`mean_price`,`followInfo`) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            try:
                cursor.execute(sql, (i["title"], i["link"], i["address_xiaoqu"], i["address_info"], i["flood"], i["area"], i["total_price"], i["mean_price"], i["followInfo"]))
                db.commit()
                print(i["title"] + " is success")  # the listing dict has no "name" key, so print the title
            except:
                db.rollback()  # roll back the failed insert and keep going
        time.sleep(3.5)
    db.close()
[Figure: the Lianjia second-hand listings in MySQL]
Processing the Lianjia data:
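The analysis below reads a local b.csv file. One way to produce it (a sketch of my own, not from the original post) is to export the scraped MySQL table with pandas; the gbk encoding matches what the read_csv call below expects:

import pymysql
import pandas as pd

# dump the scraped Lianjia table (named douban_books in the script above) to b.csv
db = pymysql.connect(host="localhost", user="root", password="root", db="new_schema", charset="utf8mb4")
df = pd.read_sql("SELECT * FROM douban_books", db)
db.close()
# note: characters that gbk cannot represent would raise here; the analysis script expects gbk
df.to_csv("b.csv", index=False, encoding="gbk")

The analysis script itself follows.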
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# read the exported CSV file
df = pd.read_csv('b.csv', encoding='gbk')
house = pd.DataFrame(df)
# split the house-info string into separate columns
houseinfo_split = pd.DataFrame((x.split('|') for x in house.address_info), index=house.index, columns=['xiaoqu','huxing','mianji','chaoxiang','zhuangxiu','dianti'])
print(houseinfo_split.head())
# join the split columns back onto the original table
house = pd.merge(house, houseinfo_split, right_index=True, left_index=True)
# split the attention / follow-up info into separate columns
followinfo_split = pd.DataFrame((x.split('/') for x in house.followInfo), index=house.index, columns=['guanzhu','daikan','fabu'])
# join the split attention columns back onto the original table
house = pd.merge(house, followinfo_split, right_index=True, left_index=True)
# count listings per house type (huxing)
huxing = house.groupby('huxing')['huxing'].agg(len)
# inspect the per-type counts
print(huxing)
# the script contains several plots; comment out the others when running one, so they do not interfere
# bar chart of the house-type distribution
plt.rc('font', family='STXihei', size=11)
a = np.array([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20])
plt.barh([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], huxing, color='#052B6C', alpha=0.8, align='center', edgecolor='white')
plt.ylabel('house type')
plt.xlabel('number')
plt.xlim(0,400)
plt.ylim(0,20)
plt.title('Housing family distribution')
plt.legend(['Number'], loc='upper right')
plt.grid(color='#95a5a6', linestyle='--', linewidth=2, axis='y', alpha=0.4)
plt.yticks(a, ('13室0厅','15室6厅','1室0厅','1室1厅','1室2厅','2室1厅','2室2厅','3室1厅','3室2厅','4室1厅','4室2厅','4室3厅','5室0厅','5室1厅','5室2厅','5室3厅','6室2厅','6室3厅','7室2厅','9室9厅'))
plt.show()
# note: the length of a = np.array([...]) and the plt.yticks labels must match the number of categories in the huxing output;
# in one of my runs there were only 9 categories, so I used 1-9 there, and plt.yticks had to match as well
# romanized labels map to the Chinese house types, e.g. 1shi0ting means 1室0厅, 1shi1ting means 1室1厅, and so on
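A small variation (my own sketch, not part of the original post) avoids hand-counting the categories by deriving the bar positions and tick labels directly from the huxing Series computed above:

# derive positions and labels from the grouped Series instead of hardcoding them
pos = np.arange(1, len(huxing) + 1)
plt.barh(pos, huxing.values, color='#052B6C', alpha=0.8, align='center', edgecolor='white')
plt.yticks(pos, huxing.index)
plt.ylabel('house type')
plt.xlabel('number')
plt.title('Housing family distribution')
plt.show()

The original script continues below with the floor-area columns.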
# split the floor-area string a second time, into number and unit
mianji_num_split = pd.DataFrame((x.split('平') for x in house.mianji), index=house.index, columns=['mianji_num','mi'])
# join the split area columns back onto the original table
house = pd.merge(house, mianji_num_split, right_index=True, left_index=True)
# strip whitespace around mianji_num
house['mianji_num'] = house['mianji_num'].map(str.strip)
# convert mianji_num to float
house['mianji_num'] = house['mianji_num'].astype(float)
# check the range of floor areas
print(house['mianji_num'].min(), house['mianji_num'].max())
# bin the floor areas
bins = [0, 50, 100, 150, 200, 250, 300, 350]
group_mianji = ['less than 50', '50-100', '100-150', '150-200', '200-250', '250-300', '300-350']
house['group_mianji'] = pd.cut(house['mianji_num'], bins, labels=group_mianji)
# count listings per floor-area bin
group_mianji = house.groupby('group_mianji')['group_mianji'].agg(len)
# bar chart of the floor-area distribution (run this block on its own, per the note above)
plt.rc('font', family='STXihei', size=15)
a = np.array([1,2,3,4,5,6,7])
plt.barh([1,2,3,4,5,6,7], group_mianji, color='#052B6C', alpha=0.8, align='center', edgecolor='white')
plt.ylabel('mianji group')
plt.xlabel('number')
plt.title('Housing area of distribution')
plt.legend(['number'], loc='upper right')
plt.grid(color='#95a5a6', linestyle='--', linewidth=1, axis='y', alpha=0.4)
plt.yticks(a, ('less 50', '50-100', '100-150', '150-200', '200-250', '250-300', '300-350'))
plt.show()
# split the attention string a second time, into number and unit
guanzhu_num_split = pd.DataFrame((x.split('人') for x in house.guanzhu), index=house.index, columns=['guanzhu_num','ren'])
# join the split attention columns back onto the original table
house = pd.merge(house, guanzhu_num_split, right_index=True, left_index=True)
# strip whitespace around guanzhu_num
house['guanzhu_num'] = house['guanzhu_num'].map(str.strip)
# convert the attention count and the total price to float
house[['guanzhu_num','total_price']] = house[['guanzhu_num','total_price']].astype(float)
# check the range of attention counts
print(house['guanzhu_num'].min(), house['guanzhu_num'].max())
# bin the attention counts; set the bins according to the min()/max() printed above
bins = [0, 20, 50, 80, 200, 500]
group_guanzhu = ['小于20', '20-50', '50-80', '80-200', '200-500']
house['group_guanzhu'] = pd.cut(house['guanzhu_num'], bins, labels=group_guanzhu)
group_guanzhu = house.groupby('group_guanzhu')['group_guanzhu'].agg(len)
# bar chart of the attention distribution (run this block on its own, per the note above)
plt.rc('font', family='STXihei', size=15)
a = np.array([1,2,3,4,5])
plt.barh([1,2,3,4,5], group_guanzhu, color='#052B6C', alpha=0.8, align='center', edgecolor='white')
plt.ylabel('Interest groups')
plt.xlabel('Number')
plt.xlim(0,1000)
plt.title('Housing attention distribution')
plt.legend(['Number'], loc='upper right')
plt.grid(color='#95a5a6', linestyle='--', linewidth=1, axis='y', alpha=0.4)
plt.yticks(a, ('less 20', '20-50', '50-80', '80-200', '200-500'))
plt.show()
# split the unit-price string (e.g. 单价43000元/平米) on 元 and then 价 to isolate the number, and plot it
house = pd.DataFrame((x.split('元') for x in house.mean_price), index=df.index, columns=['mean_price','pingmi'])
house = pd.DataFrame((x.split('价') for x in house.mean_price), index=df.index, columns=['danjia','mean_price'])
house[['mean_price']] = house[['mean_price']].astype(int)
print(house['mean_price'].min(), house['mean_price'].max())
plt.plot(house['mean_price'])
plt.show()
[Figures: house-type distribution, floor-area distribution, attention/showings distribution, and average-price distribution]
Scraping 100 pages of Lianjia second-hand listings, 2,970 in total:
import pymysql
import requests
from bs4 import BeautifulSoup
import time
# %d is a numeric placeholder for the page number
baseUrl = "http://gz.lianjia.com/ershoufang/pg%d/"
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36","Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}
def get_books(start):
    url = baseUrl % start
    lists = []
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.content, "html.parser")  # parse the page with BeautifulSoup
    items = soup.find("ul", "sellListContent").find_all("li")  # all listing entries on this page
    for i in items:
        books = {}  # temporary dict for one listing
        books["title"] = i.find("div", "title").a.string  # e.g. 雅逸庭3房出售 单边位 带主套 免交个税
        books["link"] = i.find("div", "title").a.get('href')  # e.g. http://gz.lianjia.com/ershoufang/GZ0002556701.html
        books["address_xiaoqu"] = i.find("div", "houseInfo").a.string  # e.g. 广州雅居乐花园雅逸庭
        books["address_info"] = i.find("div", "houseInfo").a.next_sibling  # e.g. | 3室2厅 | 109.5平米 | 东北 | 精装 | 无电梯
        books["flood"] = i.find("div", "flood").find("div", "positionInfo").span.next_sibling  # e.g. 中楼层(共6层)2010年建塔楼 -
        books["area"] = i.find("div", "flood").find("div", "positionInfo").a.string  # e.g. 华南
        books["total_price"] = i.find("div", "totalPrice").find("span").text  # total price
        books["mean_price"] = i.find("div", "unitPrice").find("span").text  # price per square metre
        books["followInfo"] = i.find("div", "followInfo").span.next_sibling  # e.g. 103人关注 / 共42次带看 / 29天以前发布
        lists.append(books)  # collect into the result list
    time.sleep(3.5)  # pause after each page to avoid getting banned
    return lists
if __name__ == "__main__":
    # connect to MySQL; specify charset, otherwise inserts may fail with encoding errors
    db = pymysql.connect(host="localhost", user="root", password="root", db="new_schema", charset="utf8mb4")
    cursor = db.cursor()
    cursor.execute("DROP TABLE IF EXISTS douban_books")  # drop the table if it already exists (name reused from the book script)
    # SQL statement to create the table
    createTab = """CREATE TABLE douban_books(
        id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(100) NOT NULL,
        link VARCHAR(100) NOT NULL,
        address_xiaoqu VARCHAR(100) NOT NULL,
        address_info VARCHAR(100) NOT NULL,
        flood VARCHAR(100) NOT NULL,
        area VARCHAR(300) NOT NULL,
        total_price VARCHAR(50) NOT NULL,
        mean_price VARCHAR(100) NOT NULL,
        followInfo VARCHAR(200) NOT NULL
    )"""
    cursor.execute(createTab)
    for start in range(1, 100, 1):
        lists = get_books(start)  # fetch and parse one page of listings
        for i in lists:
            # insert one row; %s is the driver's string placeholder
            sql = "INSERT INTO `douban_books`(`title`,`link`,`address_xiaoqu`,`address_info`,`flood`,`area`,`total_price`,`mean_price`,`followInfo`) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            try:
                cursor.execute(sql, (i["title"], i["link"], i["address_xiaoqu"], i["address_info"], i["flood"], i["area"], i["total_price"], i["mean_price"], i["followInfo"]))
                db.commit()
                print(i["title"] + " is success")  # the listing dict has no "name" key, so print the title
            except:
                db.rollback()  # roll back the failed insert and keep going
        time.sleep(3.5)
    db.close()
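To sanity-check the claimed total (a usage sketch of my own, not part of the original post), the row count can be read back after the run; 99 pages (range(1, 100)) of 30 listings each works out to 2,970:

import pymysql

db = pymysql.connect(host="localhost", user="root", password="root", db="new_schema", charset="utf8mb4")
cursor = db.cursor()
cursor.execute("SELECT COUNT(*) FROM douban_books")  # the Lianjia rows were written into this table
print(cursor.fetchone()[0])
db.close()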