python爬虫抓取正方教务系统信息
2017-04-01 本文已影响1011人
海上牧云l
学校用的是正方教务系统,我参考了网上的一些例子,写了一个抓取学生信息和课表的爬虫。这里主要用 requests 发送请求、用 bs4 解析页面(登录页的 __VIEWSTATE 用 lxml 的 XPath 提取)。数据库用的是 MySQL,通过 peewee 操作。
网上有人说可以绕过验证码,但那个 bug 貌似已经被修复了,所以这里把验证码图片保存到本地(check.jpg),查看后需要手动输入。
往数据库里存课表时麻烦了一点:课程表的表格里有“上午”“下午”这样的单元格,“上午”被归到第 1 节所在的那一行,“下午”被归到下午第 1 节所在的那一行,这些行比其他行多出一列,各行的列位置就对不齐了,表的格式也就乱了。暂时没想到更好的办法,就先按行分别处理列下标。
import requests
from lxml import etree
import urllib.parse
from bs4 import BeautifulSoup
from peewee import CharField, Model, MySQLDatabase, OperationalError
# Shared peewee handle for the local MySQL server; both models bind to it.
# NOTE(review): host and credentials are hard-coded for a local dev setup.
db = MySQLDatabase(
    'blog',
    host='127.0.0.1',
    port=3306,
    user='root',
    passwd='root',
)
# Open the connection eagerly so a bad configuration fails at start-up.
db.connect()
class Student(Model):
    """Basic profile of one student, stored through the shared ``db`` handle."""

    stu_num = CharField()    # student number
    name = CharField()       # student name
    stu_class = CharField()  # class the student belongs to
    faculty = CharField()    # faculty / department
    major = CharField()      # major

    class Meta:
        database = db
class Class(Model):
    """One timetable row: a period slot with one text column per weekday.

    Every column is nullable, so a row may be stored with only some of its
    fields filled in.
    """

    name = CharField(max_length=100, null=True)  # timetable label
    type = CharField(max_length=10, null=True)   # period label, e.g. '1、2节'

    # One column per weekday, in timetable order.
    mon = CharField(null=True)
    # NOTE(review): "the" sits between mon and wed, so it is presumably the
    # Tuesday column; renaming it would change the database schema.
    the = CharField(null=True)
    wed = CharField(null=True)
    thu = CharField(null=True)
    fri = CharField(null=True)
    sat = CharField(null=True)
    sun = CharField(null=True)

    class Meta:
        database = db
# Create the tables on first run. safe=True makes peewee skip tables that
# already exist, so re-running the script is harmless. Unlike a blanket
# "except OperationalError: pass", this neither hides unrelated database
# failures nor skips creating Class when only Student already exists.
db.create_tables([Student, Class], safe=True)
class Qlu:
    """Interactive scraper for a Zhengfang academic-affairs site.

    Logs in with a student number, password and a manually typed captcha,
    then stores the student's basic information and one term's timetable
    through the ``Student`` and ``Class`` peewee models.
    """

    # Class-model field names for the seven weekday columns, in page order.
    _DAY_FIELDS = ('mon', 'the', 'wed', 'thu', 'fri', 'sat', 'sun')

    # (index into the page's <tr> list, index of the Monday <td>, period label).
    # Rows 4, 8 and 12 start their weekday columns one cell later than the
    # others, presumably because they carry an extra leading
    # morning/afternoon/evening cell.
    _PERIOD_ROWS = (
        (4, 2, '1、2节'),
        (6, 1, '3、4节'),
        (8, 2, '5、6节'),
        (10, 1, '7、8节'),
        (12, 2, '9、10节'),
        (14, 1, '11节'),
    )

    def __init__(self, host='http://210.44.159.22'):
        """Create a scraper with its own HTTP session.

        Args:
            host: Scheme and host of the site, without a path. Defaults to
                the address this script was written against.
        """
        self.host = host.rstrip('/')
        self.base_url = self.host + '/default2.aspx'
        # One session for every request so cookies set at login are reused.
        self.session = requests.Session()

    @staticmethod
    def _school_year(xnd):
        """Return the 'YYYY-YYYY+1' school-year string for start year *xnd*.

        Raises:
            ValueError: if *xnd* cannot be parsed as an integer.
        """
        return '{}-{}'.format(xnd, int(xnd) + 1)

    @staticmethod
    def _week_columns(texts, first_col):
        """Map the seven weekday field names to consecutive cell texts.

        Args:
            texts: Text of every ``<td>`` in one timetable row.
            first_col: Index in *texts* of the Monday cell.

        Raises:
            IndexError: if the row has fewer than ``first_col + 7`` cells.
        """
        return {
            day: texts[first_col + offset]
            for offset, day in enumerate(Qlu._DAY_FIELDS)
        }

    def login(self):
        """Log in interactively, then fetch and store the timetable.

        Prompts on stdin for the student number, password and captcha text.
        The captcha image is written to ``check.jpg`` in the current working
        directory so it can be read by a human.

        Raises:
            RuntimeError: if the page fetched after login has no ``#xhxm``
                element, which is where the student's name is read from.
        """
        response = self.session.get(self.base_url)
        selector = etree.HTML(response.content)
        viewstate = selector.xpath('//*[@id="form1"]/input/@value')[0]
        username = input('输入学号')
        password = input('输入密码')

        # Save the captcha image locally; it has to be typed in by hand.
        check_img_url = self.host + '/CheckCode.aspx'
        img_resp = self.session.get(check_img_url, stream=True)
        with open('check.jpg', 'wb') as f:
            f.write(img_resp.content)
        captcha = input('输入验证码')

        data = {
            '__VIEWSTATE': viewstate,
            'txtUserName': username,
            'TextBox2': password,
            'txtSecretCode': captcha,
            # NOTE(review): this value is a decode-failure placeholder, not a
            # real form value; the original string was lost. Presumably it
            # should be the GB2312-encoded role name for a student login.
            # Confirm against the live login form and replace it.
            'RadioButtonList1': '(unable to decode value)',
            'Button1': '',
            'lbLanguage': '',
            'hidPdrs': '',
            'hidsc': '',
        }
        self.session.post(self.base_url, data=data)

        # URL of the page shown after logging in.
        info_url = self.host + '/xs_main.aspx?xh={}'.format(username)
        self.session.headers['Referer'] = info_url
        login_page = self.session.get(info_url)
        soup = BeautifulSoup(login_page.text, 'lxml')

        name_nodes = soup.select('#xhxm')
        if not name_nodes:
            # Without this check a rejected login only shows up as a bare
            # IndexError on the lookup below.
            raise RuntimeError('登录失败:登录后的页面中找不到 #xhxm,请检查学号、密码和验证码')
        # Student name: the element text minus its last two characters
        # (presumably a fixed suffix after the name).
        name = name_nodes[0].get_text()[:-2]
        self.get_schedule(username, name)

    def get_stu_info(self, soup):
        """Read the student's details from *soup* and insert a Student row.

        Args:
            soup: Parsed timetable page containing ``#Label5``..``#Label9``.

        Raises:
            IndexError: if any of the expected labels is missing.
        """
        def label_text(selector):
            return soup.select(selector)[0].get_text()

        stu_num = label_text('#Label5')  # student number
        name = label_text('#Label6')     # name
        faculty = label_text('#Label7')  # faculty
        major = label_text('#Label8')    # major
        class1 = label_text('#Label9')   # class
        Student.create(stu_num=stu_num, name=name, stu_class=class1,
                       faculty=faculty, major=major)

    def get_schedule(self, username, name):
        """Fetch one term's timetable and store it as Class rows.

        Also stores the student's details via ``get_stu_info``. Prompts on
        stdin for the starting year of the school year and for the term.

        Args:
            username: Student number used to log in.
            name: Student name as shown on the site (not URL-encoded).

        Raises:
            ValueError: if the entered year is not an integer.
            IndexError: if the page lacks the expected rows or cells.
        """
        # The name goes into the query string GB2312-encoded.
        name = urllib.parse.quote_plus(name.encode('gb2312'))
        req_url = (self.host + '/xskbcx.aspx?xh=' + username
                   + '&xm=' + name + '&gnmkdm=N121603')
        response = self.session.get(req_url)
        soup = BeautifulSoup(response.text, 'lxml')

        # Basic student information is on the same page.
        self.get_stu_info(soup)

        viewstate = soup.select('#xskb_form input')[2].get('value')
        xnd = input('输入学年(例如输入2016,查询2016-2017学年的)')
        xqd = input('输入学期(1或2)')
        school_year = self._school_year(xnd)
        data = {
            '__EVENTTARGET': 'xnd',
            '__EVENTARGUMENT': '',
            '__VIEWSTATE': viewstate,
            'xnd': school_year,  # school year, e.g. 2015-2016
            'xqd': xqd,          # term
        }
        self.session.headers['Referer'] = (
            self.host + '/xs_main.aspx?xh={}'.format(username))
        resp = self.session.post(req_url, data=data)
        soup = BeautifulSoup(resp.text, 'lxml')
        rows = soup.select('tr')

        # Label identifying this timetable (school year / term).
        table_name = '{}学年/第{}学期'.format(school_year, xqd)

        for position, (row_index, first_col, period) in enumerate(self._PERIOD_ROWS):
            texts = [cell.get_text() for cell in rows[row_index].select('td')]
            fields = self._week_columns(texts, first_col)
            fields['type'] = period
            if position == 0:
                # Only the first row of a timetable carries the label.
                fields['name'] = table_name
            Class.create(**fields)
if __name__ == '__main__':
    # Run the interactive scrape only when executed as a script, so that
    # importing this module does not prompt for credentials.
    spi = Qlu()
    spi.login()