Pythoner集中营

Python-requests-Github模拟登陆

2018-05-22  本文已影响40人  p吊车尾

用简书很久了, 从来都是看人家写的文章,参考别人的代码, 从小学开始, 一让写文章就不知道该如何下手的我, 居然突然想写起文章来,虽然不是特别很有文采的那种, 而是一些与python爬虫有关的文章,每天坚持写写,坚持下去希望可以有很好的改变。

最重要的是希望可以碰到大神, 得到高人的指点哈哈哈哈

也希望能帮助到一些人,虽然本人的技术本来也不咋滴,不喜勿喷。。。

第一篇就分享下用python模拟登陆下github,搜索关键字。

image image

1.commit :Sign in

2.utf8: ✓

3.authenticity_token :vWja9qtBNFkrLDFwb3r+BYVTPak9Cp13Png1Wlyxz9UoAqqXDscmkqxkFK2PHnGtXx9xmctOORJr7hzL8WWIkQ==

4.login : 账号

5.password : 密码

image
# -*- coding: utf-8 -*-
import re
import requests
from pyquery import PyQuery as pq
from urllib.parse import urljoin
import logging
logging.captureWarnings(True)

# Browser-like headers attached to every session request so GitHub serves
# the crawler the same HTML it would serve a desktop Chrome client.
_header_pairs = [
    ('Connection', 'keep-alive'),
    ('Cache-Control', 'max-age=0'),
    ('Upgrade-Insecure-Requests', '1'),
    ('Content-Type', 'application/x-www-form-urlencoded'),
    ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'),
    ('Accept-Encoding', 'gzip, deflate, br'),
    ('Accept-Language', 'zh-CN,zh;q=0.9'),
]
default_headers = dict(_header_pairs)
class github_pro:
    """Log in to GitHub with a requests Session and crawl search results.

    Flow: fetch the CSRF token from /login, POST credentials to /session,
    then page through /search results, printing repo title/url/update date.
    """

    def __init__(self):
        self.sess = requests.Session()
        # Skip SSL certificate verification (warnings captured at import time).
        self.sess.verify = False
        # NOTE(review): requests does not honor `allow_redirects` set on a
        # Session object — it is a per-request argument.  Kept so behavior
        # for existing callers is unchanged, but it has no effect.
        self.sess.allow_redirects = False
        # Merge our browser-like headers into the session defaults instead of
        # replacing them (replacing dropped requests' own defaults entirely).
        self.sess.headers.update(default_headers)
        self.authenticitytoken_url = 'https://github.com/login'
        self.login_url = 'https://github.com/session'
        self.user = '你的账号'
        self.passwd = '你的密码'
        # Initial state; flipped to True after a successful login.
        self.login_status = False
        self.search_url = 'https://github.com/search'
        self.baseurl = 'https://github.com'

    def authenticitytoken(self):
        '''
        Fetch the login page and extract the hidden CSRF token required
        by the login POST.
        :return: the authenticity_token string, or None if not found
        '''
        res = self.sess.get(self.authenticitytoken_url).text
        response_html = pq(res)
        # pyquery uses CSS selectors; the original XPath-style
        # 'input[@name="..."]' syntax is invalid CSS and fails to select.
        authenticity_token = response_html.find('input[name="authenticity_token"]').attr.value
        return authenticity_token

    def github_login(self, retries=3):
        '''
        POST the credentials to /session.
        :param retries: maximum number of attempts before giving up
                        (the original recursed without bound on failure)
        :return:
        '''
        login_data = {
            'commit': 'Sign in',
            'utf8': '✓',
            'authenticity_token': self.authenticitytoken(),
            'login': self.user,
            'password': self.passwd
        }
        res = self.sess.post(self.login_url, data=login_data).text
        # The original check `re.findall(r'', res)` matches the empty pattern
        # and is therefore ALWAYS truthy — login was never actually verified.
        # Test for GitHub's failure banner instead.
        if 'Incorrect username or password' not in res:
            print('登陆成功...')
            self.login_status = True
        elif retries > 1:
            print('登陆失败,正在重试...')
            self.github_login(retries - 1)
        else:
            print('登陆失败,正在重试...')

    def search_github(self, keywords):
        '''
        Run a search for `keywords` and return the first result page.
        :param keywords: query string typed by the user
        :return: raw HTML of the search result page
        '''
        search_params = {
            'utf8': '✓',
            'q': keywords,
            'type': ''
        }
        result = self.sess.get(self.search_url, params=search_params).text
        return result

    def detail_page(self, response):
        '''
        Roughly parse one result page and print title / url / update date
        for each repository entry.
        :param response: page HTML
        :return:
        '''
        response_pq = pq(response)
        # CSS class selector — the original '[@class="col-8 pr-3"]' XPath
        # form is invalid for pyquery's cssselect backend.
        for each in response_pq.find('div.col-8.pr-3').items():
            title = each.find('h3 a').text()
            url = each.find('h3 a').attr.href
            if not str(url).startswith('http'):
                url = urljoin(self.baseurl, url)
            # <relative-time> may be missing on some entries; re.sub(None)
            # would raise TypeError, so guard first.  Strip the time part
            # of the ISO timestamp, keeping the date.
            datetime_attr = each.find('relative-time').attr.datetime
            update_date = re.sub(r'T.*', '', datetime_attr) if datetime_attr else ''
            print(title)
            print(url)
            print(update_date)
            print('*' * 200)

    def mycrawler(self, response, keywords):
        '''
        Walk every result page for `keywords`.
        :param response: HTML of the first search result page
        :param keywords: the search query
        :return:
        '''
        response_pq = pq(response)
        page_num_list = [page_num.text() for page_num in response_pq.find('div.pagination a').items() if re.match(r'^\d+$', page_num.text())]
        # Compare page numbers numerically — the original string max() made
        # '9' sort above '10'.  default=1 handles a single-page result set,
        # where there are no pagination links at all.
        max_page_num = max((int(p) for p in page_num_list), default=1)
        print(max_page_num)
        for num in range(1, max_page_num + 1):
            print('正在抓取第{}页..'.format(num))
            params = {
                'p': num,
                'q': keywords,
                'type': 'Repositories',
                'utf8': '✓',
            }
            result = self.sess.get(self.search_url, params=params).text
            self.detail_page(result)


if __name__ == '__main__':
    # Log in first, then drive one keyword search and the paginated crawl.
    crawler = github_pro()
    crawler.github_login()
    query = input('Please input your search keywords:')
    first_page_html = crawler.search_github(query)
    crawler.mycrawler(first_page_html, query)

总结:

上一篇下一篇

猜你喜欢

热点阅读