采集 WIPO（世界知识产权组织）全球品牌数据库

2021-12-13  本文已影响0人  是东东

wipo_js.py

import json
import os


class GetWipoJS(object):
    """Generate and run the Node.js snippet that encodes a search state.

    The generated script defines a state object ``cc`` and prints
    ``LZString.compressToBase64(cc)``.  ``wipo_spider.py`` posts that printed
    value as the ``qz`` form field, obtaining it through :meth:`get_qz`.

    Running the script requires a ``node`` executable on ``PATH``; the script
    is written to ``wipo_js.js`` in the current working directory.
    """

    # Sample token used when the caller does not supply its own ``qi``.
    DEFAULT_QI = "0-1OOf/PeepQuegNdjyPRWQP8uO1YSPDMK7DnyHlO6O/8="

    def _getCurrentState(self, qi=None, queue=1):
        """Return the state dict that is serialised into the JS variable ``cc``.

        Args:
            qi: Token placed under the ``"qi"`` key; ``None`` selects
                ``DEFAULT_QI``.
            queue: Value placed under the ``"queue"`` key (the spider passes
                its page number here).

        Returns:
            dict: The state object.
        """
        dic = {
            "type": "brand",
            "la": "en",
            "qi": self.DEFAULT_QI if qi is None else qi,
            "queue": queue,
            # NOTE(review): meaning of this constant is not visible here; kept as-is.
            "_": "11569",
        }
        return dic

    def get_js_str(self, qi=None, queue=1):
        """Return the full JavaScript source to execute.

        The state is embedded with ``json.dumps`` so that the literal is valid
        JavaScript for any value (quotes, ``None``, booleans), not only for
        values whose Python ``repr`` happens to look like JS.

        Args:
            qi: Forwarded to :meth:`_getCurrentState`.
            queue: Forwarded to :meth:`_getCurrentState`.

        Returns:
            str: ``var cc = {...};`` followed by the LZString implementation
            and a ``console.log`` of the encoded state.
        """
        state = self._getCurrentState(qi=qi, queue=queue)
        js_str1 = 'var cc = %s;' % json.dumps(state)
        js_str2 = """
        var LZString = {
            _keyStr: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=",
            _f: String.fromCharCode,
            compressToBase64: function(cc) {
                var c = JSON.stringify(cc);
                if (c == null) {
                    return ""
                }
                var a = "";
                var k, h, f, j, g, e, d;
                var b = 0;
                c = LZString.compress(c);
                while (b < c.length * 2) {
                    if (b % 2 == 0) {
                        k = c.charCodeAt(b / 2) >> 8;
                        h = c.charCodeAt(b / 2) & 255;
                        if (b / 2 + 1 < c.length) {
                            f = c.charCodeAt(b / 2 + 1) >> 8
                        } else {
                            f = NaN
                        }
                    } else {
                        k = c.charCodeAt((b - 1) / 2) & 255;
                        if ((b + 1) / 2 < c.length) {
                            h = c.charCodeAt((b + 1) / 2) >> 8;
                            f = c.charCodeAt((b + 1) / 2) & 255
                        } else {
                            h = f = NaN
                        }
                    }
                    b += 3;
                    j = k >> 2;
                    g = ((k & 3) << 4) | (h >> 4);
                    e = ((h & 15) << 2) | (f >> 6);
                    d = f & 63;
                    if (isNaN(h)) {
                        e = d = 64
                    } else {
                        if (isNaN(f)) {
                            d = 64
                        }
                    }
                    a = a + LZString._keyStr.charAt(j) + LZString._keyStr.charAt(g) + LZString._keyStr.charAt(e) + LZString._keyStr.charAt(d)
                }
                return a
            },
            compress: function(e) {
                if (e == null) {
                    return ""
                }
                var h, l, n = {}, m = {}, o = "", c = "", r = "", d = 2, g = 3, b = 2, q = "", a = 0, j = 0, p, k = LZString._f;
                for (p = 0; p < e.length; p += 1) {
                    o = e.charAt(p);
                    if (!Object.prototype.hasOwnProperty.call(n, o)) {
                        n[o] = g++;
                        m[o] = true
                    }
                    c = r + o;
                    if (Object.prototype.hasOwnProperty.call(n, c)) {
                        r = c
                    } else {
                        if (Object.prototype.hasOwnProperty.call(m, r)) {
                            if (r.charCodeAt(0) < 256) {
                                for (h = 0; h < b; h++) {
                                    a = (a << 1);
                                    if (j == 15) {
                                        j = 0;
                                        q += k(a);
                                        a = 0
                                    } else {
                                        j++
                                    }
                                }
                                l = r.charCodeAt(0);
                                for (h = 0; h < 8; h++) {
                                    a = (a << 1) | (l & 1);
                                    if (j == 15) {
                                        j = 0;
                                        q += k(a);
                                        a = 0
                                    } else {
                                        j++
                                    }
                                    l = l >> 1
                                }
                            } else {
                                l = 1;
                                for (h = 0; h < b; h++) {
                                    a = (a << 1) | l;
                                    if (j == 15) {
                                        j = 0;
                                        q += k(a);
                                        a = 0
                                    } else {
                                        j++
                                    }
                                    l = 0
                                }
                                l = r.charCodeAt(0);
                                for (h = 0; h < 16; h++) {
                                    a = (a << 1) | (l & 1);
                                    if (j == 15) {
                                        j = 0;
                                        q += k(a);
                                        a = 0
                                    } else {
                                        j++
                                    }
                                    l = l >> 1
                                }
                            }
                            d--;
                            if (d == 0) {
                                d = Math.pow(2, b);
                                b++
                            }
                            delete m[r]
                        } else {
                            l = n[r];
                            for (h = 0; h < b; h++) {
                                a = (a << 1) | (l & 1);
                                if (j == 15) {
                                    j = 0;
                                    q += k(a);
                                    a = 0
                                } else {
                                    j++
                                }
                                l = l >> 1
                            }
                        }
                        d--;
                        if (d == 0) {
                            d = Math.pow(2, b);
                            b++
                        }
                        n[c] = g++;
                        r = String(o)
                    }
                }
                if (r !== "") {
                    if (Object.prototype.hasOwnProperty.call(m, r)) {
                        if (r.charCodeAt(0) < 256) {
                            for (h = 0; h < b; h++) {
                                a = (a << 1);
                                if (j == 15) {
                                    j = 0;
                                    q += k(a);
                                    a = 0
                                } else {
                                    j++
                                }
                            }
                            l = r.charCodeAt(0);
                            for (h = 0; h < 8; h++) {
                                a = (a << 1) | (l & 1);
                                if (j == 15) {
                                    j = 0;
                                    q += k(a);
                                    a = 0
                                } else {
                                    j++
                                }
                                l = l >> 1
                            }
                        } else {
                            l = 1;
                            for (h = 0; h < b; h++) {
                                a = (a << 1) | l;
                                if (j == 15) {
                                    j = 0;
                                    q += k(a);
                                    a = 0
                                } else {
                                    j++
                                }
                                l = 0
                            }
                            l = r.charCodeAt(0);
                            for (h = 0; h < 16; h++) {
                                a = (a << 1) | (l & 1);
                                if (j == 15) {
                                    j = 0;
                                    q += k(a);
                                    a = 0
                                } else {
                                    j++
                                }
                                l = l >> 1
                            }
                        }
                        d--;
                        if (d == 0) {
                            d = Math.pow(2, b);
                            b++
                        }
                        delete m[r]
                    } else {
                        l = n[r];
                        for (h = 0; h < b; h++) {
                            a = (a << 1) | (l & 1);
                            if (j == 15) {
                                j = 0;
                                q += k(a);
                                a = 0
                            } else {
                                j++
                            }
                            l = l >> 1
                        }
                    }
                    d--;
                    if (d == 0) {
                        d = Math.pow(2, b);
                        b++
                    }
                }
                l = 2;
                for (h = 0; h < b; h++) {
                    a = (a << 1) | (l & 1);
                    if (j == 15) {
                        j = 0;
                        q += k(a);
                        a = 0
                    } else {
                        j++
                    }
                    l = l >> 1
                }
                while (true) {
                    a = (a << 1);
                    if (j == 15) {
                        q += k(a);
                        break
                    } else {
                        j++
                    }
                }
                return q
            },
            };
            console.log(LZString.compressToBase64(cc));
        """
        return js_str1 + js_str2

    def _run_node(self, js_str):
        """Write *js_str* to ``wipo_js.js``, run it with node, return its stdout.

        Newlines are removed from the captured output.  The pipe is closed
        deterministically via the context manager.
        """
        with open('wipo_js.js', 'w', encoding='utf-8') as ww:
            ww.write(js_str)
        with os.popen('node wipo_js.js') as pipe:
            return pipe.read().replace('\n', '')

    def get_qz(self, qi, queue):
        """Return the encoded state for the given token and queue/page number.

        This is the entry point used by ``wipo_spider.py``
        (``get_qz(qi=qi, queue=page)``).

        Args:
            qi: Token to embed under ``"qi"``.
            queue: Value to embed under ``"queue"``.

        Returns:
            str: Whatever the node script printed, with newlines stripped.
        """
        return self._run_node(self.get_js_str(qi=qi, queue=queue))

    def start(self):
        """Encode the default state, print the result and return it."""
        res = self._run_node(self.get_js_str())
        print(res)
        return res


if __name__ == '__main__':
    # Script entry point: write the JS file, run it with node and print the output.
    GetWipoJS().start()

wipo_spider.py

import requests
import re
import time
from tools import get_ua
from wipo_js import GetWipoJS

def replaces(x):
    """Return ``str(x)`` with newlines removed and surrounding whitespace stripped."""
    return f'{x}'.replace('\n', '').strip()


class MailRuPC(object):
    """Small crawler for the WIPO brand database search endpoint.

    On construction it fetches the landing page to extract the ``qk`` token,
    then :meth:`start` requests result pages one by one, encoding each
    request with ``GetWipoJS.get_qz``.
    """

    BASE_URL = 'https://www3.wipo.int/branddb/en/'
    SELECT_URL = 'https://www3.wipo.int/branddb/jsp/select.jsp'

    # (output key, key in the response document) pairs used by parse_all.
    FIELD_MAP = (
        ('Brand', 'BRAND'),
        ('Source', 'SOURCE'),
        ('Status', 'STATUS'),
        ('Relevance', 'score'),
        ('Origin', 'OO'),
        ('Holder', 'HOL'),
        ('HolderCountry', 'HOLC'),
        ('Number', 'ID'),
        ('AppDate', 'AD'),
        ('Imageclass', 'IMGC'),
        ('NiceCl', 'NC'),
        ('Image', 'IMG'),  # e.g. 08/47/M11580847-th.jpg
    )

    def __init__(self):
        self.qk = self.get_qk()
        self.get_wipo_js = GetWipoJS()

    def get_cookies(self):
        """Placeholder; not implemented and currently returns ``None``."""
        pass

    def get_headers(self):
        """Return the HTTP headers sent with every search request."""
        headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer': 'https://www3.wipo.int/branddb/en/',
            'Origin': 'https://www3.wipo.int',
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent': get_ua(),
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        }
        return headers

    def get_qk(self):
        """Fetch the landing page and extract the ``qk`` token from it.

        The token is accepted only when the page contains exactly two
        ``qk = "..."`` assignments and the second one is 44 characters long.

        Returns:
            str: The token, or ``''`` when it could not be extracted.
        """
        headers = {
            'User-Agent': get_ua(),
        }
        # A timeout keeps construction from hanging forever on a stalled server.
        response = requests.get(self.BASE_URL, headers=headers, timeout=60)
        qis = re.findall(r'qk = "(.*?)"', response.text)
        if len(qis) == 2 and len(qis[1]) == 44:
            return qis[1]
        return ''

    def get_with_proxy(self, dd, timeout=60):
        """POST the request described by *dd* and decode the JSON response.

        No proxy is configured here despite the method name.

        Args:
            dd: Mapping with ``target_url``, ``headers`` and ``data`` keys.
            timeout: Timeout in seconds passed to ``requests.post``.

        Returns:
            tuple: ``(status_code, json_obj)``; ``json_obj`` is ``{}`` when
            the body is not valid JSON.
        """
        target_url = dd.get('target_url')
        headers = dd.get('headers')
        data = dd.get('data')

        response = requests.post(url=target_url, headers=headers, data=data, timeout=timeout)
        # The header may be absent, and the charset may be followed by other
        # parameters, so match only the charset token itself.
        content_type = response.headers.get('Content-Type') or ''
        match = re.search(r'charset=([^;\s]+)', content_type, re.IGNORECASE)
        if match:
            response.encoding = match.group(1).strip('"\'')
        try:
            json_obj = response.json()
        except ValueError as e:
            # Non-JSON body (e.g. an HTML error page): report it and let the
            # caller's empty-payload handling take over.
            print('func get_with_proxy error:%s' % repr(e))
            json_obj = {}
        code = response.status_code
        return code, json_obj

    def parse_all(self, json_obj):
        """Extract result rows and the follow-up ``qi`` token from a response.

        Args:
            json_obj: Decoded JSON response; expected to be a dict holding
                ``qi`` and ``response.docs``.

        Returns:
            tuple: ``(result, qi)`` where ``result`` is a list of dicts keyed
            by the names in ``FIELD_MAP`` and ``qi`` is the token from the
            response, or ``''`` when it is missing.  Malformed input yields
            whatever was parsed before the problem was hit.
        """
        result = []
        qi = ''
        try:
            qi = json_obj.get('qi') or ''
            details = (json_obj.get('response') or {}).get('docs') or []
            for detail in details:
                item = {label: detail.get(field) for label, field in self.FIELD_MAP}
                result.append(item)
        except (AttributeError, TypeError) as e:
            msg = 'func parse_all error:%s' % repr(e)
            print(msg)
        return result, qi

    def control(self, qi, page):
        """Request one result page and parse it.

        Args:
            qi: Token for this request.
            page: Page number, passed to ``get_qz`` as ``queue``.

        Returns:
            tuple: ``(result, qi)`` from :meth:`parse_all`; when the response
            payload is empty, ``([], qi)`` with the incoming token so callers
            can always unpack the return value.
        """
        qz = self.get_wipo_js.get_qz(qi=qi, queue=page)
        dd = {
            'target_url': self.SELECT_URL,
            'headers': self.get_headers(),
            'data': {'qz': qz},
        }
        code, json_obj = self.get_with_proxy(dd)
        if not json_obj:
            return [], qi
        print(json_obj)
        result, qi = self.parse_all(json_obj)
        print(result)
        print(f'qi:{qi}')
        return result, qi

    def start(self, pages=2, delay=10):
        """Crawl *pages* result pages and return all parsed rows.

        Args:
            pages: Number of pages to request, starting at page 1.
            delay: Seconds to wait between consecutive page requests.

        Returns:
            list: Parsed rows from every page, in request order.
        """
        print(f'qk:{self.qk}')
        qi = '0-' + self.qk
        results = []
        for page in range(1, pages + 1):
            print(f'page:{page}')
            result, qi = self.control(qi, page)
            results.extend(result)
            # Pause only between requests, not after the final page.
            if page < pages:
                time.sleep(delay)
        return results


if __name__ == '__main__':
    # Script entry point: build the spider (fetches the qk token) and crawl.
    result = MailRuPC().start()
上一篇 | 下一篇

猜你喜欢

热点阅读