采集wipo
2021-12-13 本文已影响0人
是东东
wipo_js.py
import json
import os
import subprocess
class GetWipoJS(object):
    """Build the compressed ``qz`` token from a small search-state object.

    The state (``type``, ``la``, ``qi``, ``queue``, ``_``) is serialised into a
    JavaScript snippet together with an LZString ``compressToBase64``
    implementation. The snippet is written to ``SCRIPT_PATH`` and executed with
    Node.js; whatever the script prints is the token.

    Node.js (``node``) must be available on ``PATH``.
    """

    # Defaults reproduce the state that was originally hard-coded.
    DEFAULT_QI = "0-1OOf/PeepQuegNdjyPRWQP8uO1YSPDMK7DnyHlO6O/8="
    DEFAULT_QUEUE = 1

    # The generated script is written here, relative to the working directory.
    SCRIPT_PATH = 'wipo_js.js'

    # JavaScript payload appended after the ``var cc = ...;`` line. It is kept
    # byte-for-byte as found; it JSON-stringifies ``cc``, LZ-compresses it and
    # prints the base64 form.
    _LZSTRING_JS = """
var LZString = {
_keyStr: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=",
_f: String.fromCharCode,
compressToBase64: function(cc) {
var c = JSON.stringify(cc);
if (c == null) {
return ""
}
var a = "";
var k, h, f, j, g, e, d;
var b = 0;
c = LZString.compress(c);
while (b < c.length * 2) {
if (b % 2 == 0) {
k = c.charCodeAt(b / 2) >> 8;
h = c.charCodeAt(b / 2) & 255;
if (b / 2 + 1 < c.length) {
f = c.charCodeAt(b / 2 + 1) >> 8
} else {
f = NaN
}
} else {
k = c.charCodeAt((b - 1) / 2) & 255;
if ((b + 1) / 2 < c.length) {
h = c.charCodeAt((b + 1) / 2) >> 8;
f = c.charCodeAt((b + 1) / 2) & 255
} else {
h = f = NaN
}
}
b += 3;
j = k >> 2;
g = ((k & 3) << 4) | (h >> 4);
e = ((h & 15) << 2) | (f >> 6);
d = f & 63;
if (isNaN(h)) {
e = d = 64
} else {
if (isNaN(f)) {
d = 64
}
}
a = a + LZString._keyStr.charAt(j) + LZString._keyStr.charAt(g) + LZString._keyStr.charAt(e) + LZString._keyStr.charAt(d)
}
return a
},
compress: function(e) {
if (e == null) {
return ""
}
var h, l, n = {}, m = {}, o = "", c = "", r = "", d = 2, g = 3, b = 2, q = "", a = 0, j = 0, p, k = LZString._f;
for (p = 0; p < e.length; p += 1) {
o = e.charAt(p);
if (!Object.prototype.hasOwnProperty.call(n, o)) {
n[o] = g++;
m[o] = true
}
c = r + o;
if (Object.prototype.hasOwnProperty.call(n, c)) {
r = c
} else {
if (Object.prototype.hasOwnProperty.call(m, r)) {
if (r.charCodeAt(0) < 256) {
for (h = 0; h < b; h++) {
a = (a << 1);
if (j == 15) {
j = 0;
q += k(a);
a = 0
} else {
j++
}
}
l = r.charCodeAt(0);
for (h = 0; h < 8; h++) {
a = (a << 1) | (l & 1);
if (j == 15) {
j = 0;
q += k(a);
a = 0
} else {
j++
}
l = l >> 1
}
} else {
l = 1;
for (h = 0; h < b; h++) {
a = (a << 1) | l;
if (j == 15) {
j = 0;
q += k(a);
a = 0
} else {
j++
}
l = 0
}
l = r.charCodeAt(0);
for (h = 0; h < 16; h++) {
a = (a << 1) | (l & 1);
if (j == 15) {
j = 0;
q += k(a);
a = 0
} else {
j++
}
l = l >> 1
}
}
d--;
if (d == 0) {
d = Math.pow(2, b);
b++
}
delete m[r]
} else {
l = n[r];
for (h = 0; h < b; h++) {
a = (a << 1) | (l & 1);
if (j == 15) {
j = 0;
q += k(a);
a = 0
} else {
j++
}
l = l >> 1
}
}
d--;
if (d == 0) {
d = Math.pow(2, b);
b++
}
n[c] = g++;
r = String(o)
}
}
if (r !== "") {
if (Object.prototype.hasOwnProperty.call(m, r)) {
if (r.charCodeAt(0) < 256) {
for (h = 0; h < b; h++) {
a = (a << 1);
if (j == 15) {
j = 0;
q += k(a);
a = 0
} else {
j++
}
}
l = r.charCodeAt(0);
for (h = 0; h < 8; h++) {
a = (a << 1) | (l & 1);
if (j == 15) {
j = 0;
q += k(a);
a = 0
} else {
j++
}
l = l >> 1
}
} else {
l = 1;
for (h = 0; h < b; h++) {
a = (a << 1) | l;
if (j == 15) {
j = 0;
q += k(a);
a = 0
} else {
j++
}
l = 0
}
l = r.charCodeAt(0);
for (h = 0; h < 16; h++) {
a = (a << 1) | (l & 1);
if (j == 15) {
j = 0;
q += k(a);
a = 0
} else {
j++
}
l = l >> 1
}
}
d--;
if (d == 0) {
d = Math.pow(2, b);
b++
}
delete m[r]
} else {
l = n[r];
for (h = 0; h < b; h++) {
a = (a << 1) | (l & 1);
if (j == 15) {
j = 0;
q += k(a);
a = 0
} else {
j++
}
l = l >> 1
}
}
d--;
if (d == 0) {
d = Math.pow(2, b);
b++
}
}
l = 2;
for (h = 0; h < b; h++) {
a = (a << 1) | (l & 1);
if (j == 15) {
j = 0;
q += k(a);
a = 0
} else {
j++
}
l = l >> 1
}
while (true) {
a = (a << 1);
if (j == 15) {
q += k(a);
break
} else {
j++
}
}
return q
},
};
console.log(LZString.compressToBase64(cc));
"""

    def _getCurrentState(self, qi=None, queue=None):
        """Return the state object that gets compressed into the token.

        Args:
            qi: Value for the ``qi`` field; ``None`` selects ``DEFAULT_QI``.
            queue: Value for the ``queue`` field; ``None`` selects
                ``DEFAULT_QUEUE``.

        Returns:
            A new dict. Key order is significant: the JavaScript side
            stringifies the object before compressing it, so reordering keys
            would change the resulting token.
        """
        return {
            "type": "brand",
            "la": "en",
            "qi": self.DEFAULT_QI if qi is None else qi,
            "queue": self.DEFAULT_QUEUE if queue is None else queue,
            # NOTE(review): meaning of this constant is not visible here.
            "_": "11569",
        }

    def get_js_str(self, qi=None, queue=None):
        """Return the complete JavaScript source that prints the token.

        The state is embedded with ``json.dumps`` rather than Python's dict
        ``repr`` so that every value (quotes, non-ASCII text, ``None``,
        booleans) is a valid JavaScript literal.
        """
        state_json = json.dumps(self._getCurrentState(qi=qi, queue=queue))
        return 'var cc = %s;' % state_json + self._LZSTRING_JS

    def get_qz(self, qi=None, queue=None):
        """Compute the token for the given ``qi`` / ``queue`` pair.

        Writes the generated script to ``SCRIPT_PATH`` and runs it with
        ``node``.

        Returns:
            The script's standard output with newlines removed.

        Raises:
            FileNotFoundError: ``node`` is not installed / not on ``PATH``.
            RuntimeError: ``node`` exited with a non-zero status.
        """
        with open(self.SCRIPT_PATH, 'w', encoding='utf-8') as script:
            script.write(self.get_js_str(qi=qi, queue=queue))
        # Argument list (no shell) so the path is never shell-interpreted.
        completed = subprocess.run(
            ['node', self.SCRIPT_PATH],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
        if completed.returncode != 0:
            # An empty token would otherwise be sent silently.
            raise RuntimeError(
                'node exited with status %s: %s'
                % (completed.returncode, completed.stderr.strip())
            )
        return completed.stdout.replace('\n', '')

    def start(self):
        """Compute, print and return the token for the default state."""
        res = self.get_qz()
        print(res)
        return res
if __name__ == '__main__':
    # Script entry point: compute and print the token for the default state.
    token_builder = GetWipoJS()
    token_builder.start()
wipo_spider.py
import requests
import re
import time
from tools import get_ua
from wipo_js import GetWipoJS
def replaces(x):
    """Format ``x`` as text, drop every newline, then trim surrounding whitespace."""
    pieces = format(x).split('\n')
    return ''.join(pieces).strip()
class MailRuPC(object):
    """Page through the WIPO brand-database search endpoint.

    Flow: fetch the landing page to obtain the ``qk`` value, then for each
    page build a ``qz`` token via ``GetWipoJS.get_qz(qi=..., queue=...)``,
    POST it to the search endpoint and flatten the returned documents.

    NOTE(review): the class name looks like a leftover from another crawler;
    it is kept because callers reference it.
    """

    HOME_URL = 'https://www3.wipo.int/branddb/en/'
    SEARCH_URL = 'https://www3.wipo.int/branddb/jsp/select.jsp'

    # (output key, key in the response document), in output order.
    _FIELD_MAP = (
        ('Brand', 'BRAND'),
        ('Source', 'SOURCE'),
        ('Status', 'STATUS'),
        ('Relevance', 'score'),
        ('Origin', 'OO'),
        ('Holder', 'HOL'),
        ('HolderCountry', 'HOLC'),
        ('Number', 'ID'),
        ('AppDate', 'AD'),
        ('Imageclass', 'IMGC'),
        ('NiceCl', 'NC'),
        ('Image', 'IMG'),  # e.g. 08/47/M11580847-th.jpg
    )

    def __init__(self):
        # Performs a network request: the landing page is fetched right away.
        self.qk = self.get_qk()
        self.get_wipo_js = GetWipoJS()

    def get_cookies(self):
        """Placeholder; no cookies are sent at the moment. Returns ``None``."""
        return None

    def get_headers(self):
        """Return the headers used for the search POST (fresh User-Agent per call)."""
        headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer': 'https://www3.wipo.int/branddb/en/',
            'Origin': 'https://www3.wipo.int',
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent': get_ua(),
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        }
        return headers

    def get_qk(self):
        """Scrape the ``qk`` value from the landing page.

        Returns:
            The second ``qk = "..."`` match when exactly two matches exist and
            that match is 44 characters long; otherwise ``''``.
        """
        headers = {
            'User-Agent': get_ua(),
        }
        # A timeout keeps a stalled connection from hanging the constructor.
        response = requests.get(self.HOME_URL, headers=headers, timeout=60)
        qis = re.findall(r'qk = "(.*?)"', response.text)
        if len(qis) == 2 and len(qis[1]) == 44:
            return qis[1]
        return ''

    @staticmethod
    def _extract_charset(content_type):
        """Return the charset named in a Content-Type header value, or ``''``.

        Accepts ``None`` (header absent), ignores parameters that follow the
        charset and strips optional quotes around it.
        """
        if not content_type:
            return ''
        match = re.search(r'charset=["\']?([^\s;"\']+)', content_type, re.IGNORECASE)
        return match.group(1) if match else ''

    def get_with_proxy(self, dd, timeout=60):
        """POST the request described by ``dd`` and decode the JSON reply.

        NOTE(review): despite the name, no proxy is configured here.

        Args:
            dd: Mapping with ``target_url``, ``headers`` and ``data``.
            timeout: Seconds passed to ``requests.post``.

        Returns:
            ``(status_code, json_obj)``; ``json_obj`` is ``None`` when the body
            is not valid JSON.
        """
        response = requests.post(
            url=dd.get('target_url'),
            headers=dd.get('headers'),
            data=dd.get('data'),
            timeout=timeout,
        )
        # Without a declared charset leave the encoding unset so that
        # ``response.json()`` detects the UTF flavour from the bytes.
        charset = self._extract_charset(response.headers.get('Content-Type'))
        response.encoding = charset or None
        try:
            json_obj = response.json()
        except ValueError as e:
            # Error pages are not JSON; report and let the caller's
            # empty-response handling take over.
            print('func get_with_proxy error:%s' % repr(e))
            json_obj = None
        return response.status_code, json_obj

    def parse_all(self, json_obj):
        """Flatten a search response into a list of item dicts.

        Args:
            json_obj: Decoded response; anything that is not a dict yields an
                empty result.

        Returns:
            ``(result, qi)``: ``result`` holds one dict per document keyed by
            the output names in ``_FIELD_MAP`` (missing fields are ``None``);
            ``qi`` is the response's ``qi`` value (``None`` if absent, ``''``
            if ``json_obj`` is not a dict).
        """
        result = []
        if not isinstance(json_obj, dict):
            return result, ''
        qi = json_obj.get('qi')
        try:
            docs = (json_obj.get('response') or {}).get('docs') or []
            for detail in docs:
                result.append({out: detail.get(src) for out, src in self._FIELD_MAP})
        except (AttributeError, TypeError) as e:
            # Malformed payload: keep whatever was parsed before the bad entry.
            msg = 'func parse_all error:%s' % repr(e)
            print(msg)
        return result, qi

    def control(self, qi, page):
        """Fetch and parse one result page.

        Args:
            qi: Query identifier to embed in the token.
            page: Page number, sent as the token's ``queue`` value.

        Returns:
            ``(result, qi)``. When the response is empty or not JSON the
            result is ``[]`` and the incoming ``qi`` is passed through.
        """
        qz = self.get_wipo_js.get_qz(qi=qi, queue=page)
        dd = {
            'target_url': self.SEARCH_URL,
            'headers': self.get_headers(),
            'data': {'qz': qz},
        }
        code, json_obj = self.get_with_proxy(dd)
        # Bound up front so the return below is valid on an empty response.
        result = []
        if json_obj:
            print(json_obj)
            result, qi = self.parse_all(json_obj)
            print(result)
            print(f'qi:{qi}')
        return result, qi

    def start(self, pages=2):
        """Crawl ``pages`` result pages and return every parsed item.

        Args:
            pages: Number of pages to request, starting at page 1.

        Returns:
            A list with the items of all pages, in page order.
        """
        print(f'qk:{self.qk}')
        qi = '0-' + self.qk
        results = []
        for page in range(1, pages + 1):
            print(f'page:{page}')
            result, qi = self.control(qi, page)
            results.extend(result)
            # Pause between requests only; nothing follows the last page.
            if page < pages:
                time.sleep(10)
        return results
if __name__ == '__main__':
    # Script entry point: crawl with the default settings and keep whatever
    # start() returns.
    spider = MailRuPC()
    result = spider.start()