2018-07-11

2018-07-11  本文已影响0人  加勒比海带_4bbc

coding=utf-8

import requests

from bs4 import BeautifulSoup

import re

import pymongo

import json

import pandas as pd

import numpy as np

import xlrd

import datetime

from pyecharts import Line,Grid,EffectScatter,Overlap
from pyecharts.echarts.axis import YAxisLabel

def onedayPriceSpider(spiderDay):

for m in range(1,4):
url = "http://www.hfzgncp.com.cn/index.php?m=content&c=index&a=lists&catid=59&sendtime="+str(spiderDay)+"&page="+m

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0)' +\

          'Gecko/20100101 Firefox/61.0'}

r = requests.get(url,headers=headers)

html = r.text.encode(encoding='utf_8').decode()

soup = BeautifulSoup(html,"lxml")

table = soup.find('table', attrs={'class' :{'h_list_table r_list_table'}})

#print(table)

#查看表格数据行数

height = len(table.findAll(lambda tag:tag.name=='tr' and

                          len(tag.findAll('td'))>=1))
if height <=0:
    return

print('height:'+str(height))

#收集表头

columns = [x.text for x in table.tr.findAll('th')]

columns = [x.replace('\xa0','') for x in columns]

#print(columns)

#构造dataframe 准备存储表格

width = len(columns)  #表头列数

#print('width:'+str(width))

df = pd.DataFrame(data = np.full((height,width),'',dtype = 'U'),columns = columns)

rows = [row for row in table.findAll('tr') if row.find('td')!=None]

#逐行分析表格

for i in range(len(rows)):

    cells = rows[i].findAll('td')

    if len(cells) == width:

        df.iloc[i] = [cell.text.replace(' ','').replace('\n','') for cell in cells]  #去点空格和换行

    else:

        w=len(cells)

        df.iloc[i,width-w:] = [cell.text.replace(' ','').replace('\n','') for cell in cells]

#print(df)

sendtime = soup.find('input', attrs={'id' :{'sendtime'}})['value'].rstrip('/-') #获取数据时间

sendtimeStr=re.sub("\-","",sendtime)

#print(sendtimeStr)

outputfilePath="D:/xlsx/"+sendtimeStr+".xlsx"

df.to_excel(outputfilePath)

xlsx_data=xlrd.open_workbook(outputfilePath)

xlsx_table=xlsx_data.sheet_by_name(u'Sheet1')

columnIndex = None

rowIndex = None 



for j in range(xlsx_table.ncols):       

    for i in range(height):

        if(xlsx_table.cell_value(i, j) == '平均价'):

            columnIndex = j

            break       

        if(xlsx_table.cell_value(i, j) == '草鱼'):

            rowIndex = i

            break

print(xlsx_table.cell_value(rowIndex, columnIndex))





price_data.append(xlsx_table.cell_value(rowIndex, columnIndex))
price_date.append(sendtime)
#client= pymongo.MongoClient()

#获取一个数据库

#db=client.priceSpider

#创建 或获取一个集合,并在collection下新建books

#account=db.prcie

#data=xlrd.open_workbook("D:/"+sendtimeStr+".xlsx")

#table=data.sheets()[0]

#读取excel第一行数据作为存入mongodb的字段名

#rowstag=table.row_values(0)

#nrows=table.nrows

#print('-------------nrows----------------'+str(nrows))

#ncols=table.ncols #print rows

#returnData={}

#for i in range(1,nrows):

    #将字段名和excel数据存储为字典形式,并转换为json格式

    #returnData[i]=json.dumps(dict(zip(rowstag,table.row_values(i))))

    #通过编解码还原数据

    #returnData[i]=json.loads(returnData[i])

    #print returnData[i]

    #account.insert(returnData[i])

    #return daySpider

def getColumnIndex(table, columnName):

columnIndex = None   

for i in range(table.ncols):       

    if(table.cell_value(0, i) == columnName):

        columnIndex = i

    break

return columnIndex

def getPriceLine(price_date,price_data):
attr = price_date
v1 = price_data
line = Line('价格走势图')
yAxis: {
type: 'value',
min: 400,
max: 700
}
line.add('鲫鱼',attr,v1,is_smooth = True,mark_point = ['max','min'],mark_line=["average"],yaxis_formatter="元")
grid =Grid()
grid.add(line,grid_top="10%")

grid.render('./鲫鱼.html')
#es = EffectScatter()
#es.add('',attr,v1,effect_scale=8)   #闪烁
#overlop = Overlap()
#overlop.add(line)                   #必须先添加line,在添加es
#overlop.add(es)
#overlop.render('./line-es01.html')

if name=="main":
price_data = []
price_date = []
spiderDaynumber =10

for i in range(spiderDaynumber):

    spiderDay = datetime.date.today()- datetime.timedelta(days=i+1)   

    print(spiderDay)

    onedayPriceSpider(spiderDay)
print(price_data)
print(price_date)
getPriceLine(price_date,price_data)
#xlsfile= 'F:/test_AutoTesting/TestCase/RunList.xlsx'

#  table = readExcelDataByName(xlsfile, 'Sheet1')[0]

    #获取第一行的值   

  #  testcase_id = table.cell_value(1, getColumnIndex(table,'平均价'))
上一篇下一篇

猜你喜欢

热点阅读