python爬虫抓取正方教务系统信息

2017-04-01  本文已影响1011人  海上牧云l

学校用的是正方教务系统。参考了网上的一些例子,写了一个抓取学生信息和课表的爬虫。这里主要用 requests 发送请求、用 bs4 解析页面;数据库用的是 MySQL,通过 peewee 来操作。

网上有人说可以绕过验证码,但那个 bug 貌似已经被修复了,所以这里把验证码图片保存到本地,需要手动输入。

往数据库里存储时麻烦了一点:课程表的表格里有"上午""下午"这样的单元格,"上午"归在第一节那一行,"下午"归在下午第一节那一行,于是这些行比其他行多出一列,表的格式就乱了。暂时没想到好的方法,就先按行分别取下标来处理。

import requests
from lxml import etree
import urllib.parse
from bs4 import BeautifulSoup
from peewee import CharField, Model, MySQLDatabase, OperationalError


# Shared peewee handle for the local MySQL server. The connection is opened
# eagerly so the table-creation step further down can run right away.
db = MySQLDatabase('blog', host='127.0.0.1', port=3306, user='root', passwd='root')
db.connect()


class Student(Model):
    """Profile of one student, filled in by ``Qlu.get_stu_info``."""

    stu_num = CharField()    # student number
    name = CharField()       # student name
    stu_class = CharField()  # class the student belongs to
    faculty = CharField()    # faculty / department
    major = CharField()      # major

    class Meta:
        # Bind the model to the module-level MySQL connection.
        database = db


class Class(Model):
    """One timetable row: a period slot with the cell text for each weekday.

    Every column is nullable; ``Qlu.get_schedule`` only sets ``name`` on the
    first row it stores for a term.
    """

    name = CharField(max_length=100, null=True)  # "<year>学年/第<n>学期" label
    type = CharField(max_length=10, null=True)   # period label, e.g. '1、2节'
    mon = CharField(null=True)
    # NOTE(review): 'the' receives the cell right after Monday's, so it is
    # presumably a typo for 'tue'; renaming it would change the column name.
    the = CharField(null=True)
    wed = CharField(null=True)
    thu = CharField(null=True)
    fri = CharField(null=True)
    sat = CharField(null=True)
    sun = CharField(null=True)

    class Meta:
        # Bind the model to the module-level MySQL connection.
        database = db

# Create the tables on first run. safe=True makes peewee skip tables that
# already exist, so re-running the script is harmless, while genuine
# operational failures are no longer silently swallowed.
db.create_tables([Student, Class], safe=True)


class Qlu:
    """Interactive scraper for a ZhengFang academic-affairs site.

    ``login`` drives the whole run: it signs in with credentials and a
    captcha typed at the console, then calls ``get_schedule``, which stores
    the student's profile (through ``get_stu_info``) and one term's
    timetable in the database via the ``Student`` and ``Class`` models.
    """

    # (index in the page's list of <tr>, index of Monday's <td>, period label).
    # Monday's column alternates between 2 and 1 -- presumably because the
    # time-of-day label cell only appears in the first row of each pair.
    _PERIOD_ROWS = (
        (4, 2, '1、2节'),
        (6, 1, '3、4节'),
        (8, 2, '5、6节'),
        (10, 1, '7、8节'),
        (12, 2, '9、10节'),
        (14, 1, '11节'),
    )

    # ``Class`` field names for Monday..Sunday, in page column order.
    _DAY_FIELDS = ('mon', 'the', 'wed', 'thu', 'fri', 'sat', 'sun')

    def __init__(self):
        """Set the login page URL and open a cookie-keeping HTTP session."""
        self.base_url = 'http://210.44.159.22/default2.aspx'
        self.session = requests.session()

    def login(self):
        """Log in interactively, then fetch and store the timetable.

        Prompts for student number, password and captcha text; the captcha
        image is written to ``check.jpg`` in the working directory so it can
        be read by eye.

        Raises:
            RuntimeError: if the page fetched after logging in does not
                contain the ``#xhxm`` element the student name is read from.
        """
        response = self.session.get(self.base_url)
        selector = etree.HTML(response.content)
        # The form's hidden state token must be echoed back in the POST.
        viewstate = selector.xpath('//*[@id="form1"]/input/@value')[0]

        username = input('输入学号')
        password = input('输入密码')

        # Fetch the captcha through the same session so it matches the
        # cookies sent with the login form, and save it for the user.
        captcha = self.session.get('http://210.44.159.22/CheckCode.aspx')
        with open('check.jpg', 'wb') as f:
            f.write(captcha.content)
        secret_code = input('输入验证码')

        data = {
            '__VIEWSTATE': viewstate,
            'txtUserName': username,
            'TextBox2': password,
            'txtSecretCode': secret_code,
            # The original sent the literal '(unable to decode value)', which
            # is what browser dev tools display for form data that is not
            # valid UTF-8. The site works in GB2312 (see get_schedule), so
            # send the student role as GB2312 bytes instead.
            # NOTE(review): confirm '学生' is the radio value the form expects.
            'RadioButtonList1': '学生'.encode('gb2312'),
            'Button1': '',
            'lbLanguage': '',
            'hidPdrs': '',
            'hidsc': '',
        }
        self.session.post(self.base_url, data=data)

        # Main page shown after a successful login.
        info_url = 'http://210.44.159.22/xs_main.aspx?xh={}'.format(username)
        self.session.headers['Referer'] = info_url

        login_page = self.session.get(info_url)
        soup = BeautifulSoup(login_page.text, 'lxml')

        name_tags = soup.select('#xhxm')
        if not name_tags:
            raise RuntimeError(
                'login failed: student name element #xhxm not found '
                '(check the student number, password and captcha)'
            )
        # Drop the last two characters of the greeting text (presumably the
        # trailing honorific) to leave just the student's name.
        name = name_tags[0].get_text()[:-2]
        self.get_schedule(username, name)

    def get_stu_info(self, soup):
        """Read the student's details from the timetable page and store them.

        Args:
            soup: parsed timetable page containing ``#Label5`` .. ``#Label9``.
        """
        def text_of(css_id):
            return soup.select(css_id)[0].get_text()

        Student.create(
            stu_num=text_of('#Label5'),    # student number
            name=text_of('#Label6'),       # name
            faculty=text_of('#Label7'),    # faculty
            major=text_of('#Label8'),      # major
            stu_class=text_of('#Label9'),  # class
        )

    def get_schedule(self, username, name):
        """Fetch the timetable for a user-chosen term and store it.

        Also stores the student's profile, which is shown on the same page.
        Prompts for the starting year of the school year and the term.

        Args:
            username: student number used to log in.
            name: student name; sent in the URL as GB2312, percent-encoded.

        Raises:
            ValueError: if the year typed in is not an integer.
            RuntimeError: if the returned page has too few table rows to
                contain a timetable; nothing is stored in that case.
        """
        quoted_name = urllib.parse.quote_plus(name.encode('gb2312'))
        req_url = ('http://210.44.159.22/xskbcx.aspx?xh=' + username
                   + '&xm=' + quoted_name + '&gnmkdm=N121603')
        response = self.session.get(req_url)
        soup = BeautifulSoup(response.text, 'lxml')
        self.get_stu_info(soup)

        # Hidden state token of the timetable form, needed for the postback.
        viewstate = soup.select('#xskb_form input')[2].get('value')

        xnd = input('输入学年(例如输入2016,查询2016-2017学年的)')
        xqd = input('输入学期(1或2)')
        school_year = str(xnd) + '-' + str(int(xnd) + 1)  # e.g. 2015-2016

        data = {
            '__EVENTTARGET': 'xnd',
            '__EVENTARGUMENT': '',
            '__VIEWSTATE': viewstate,
            'xnd': school_year,
            'xqd': xqd,
        }

        self.session.headers['Referer'] = 'http://210.44.159.22/xs_main.aspx?xh={}'.format(username)
        resp = self.session.post(req_url, data=data)
        rows = BeautifulSoup(resp.text, 'lxml').select('tr')

        # Check up front so a short page cannot leave a half-written
        # timetable in the database.
        last_row = max(row for row, _, _ in self._PERIOD_ROWS)
        if len(rows) <= last_row:
            raise RuntimeError(
                'timetable page has only {} table rows, expected at least {}'
                .format(len(rows), last_row + 1)
            )

        table_name = school_year + '学年' + '/' + '第' + str(xqd) + '学期'

        for position, (row, monday_col, label) in enumerate(self._PERIOD_ROWS):
            cells = rows[row].select('td')
            record = {'type': label}
            # NOTE(review): as in the original, only the first stored row
            # carries the term label; the remaining rows leave name unset.
            if position == 0:
                record['name'] = table_name
            for offset, day in enumerate(self._DAY_FIELDS):
                record[day] = cells[monday_col + offset].get_text()
            Class.create(**record)


# Script entry: build the scraper and start the interactive run. login()
# prompts for credentials and captcha, then fetches and stores the timetable.
spi = Qlu()
spi.login()


上一篇 下一篇

猜你喜欢

热点阅读