Dealing with Font-Encryption Anti-Scraping
2019-05-22
WangLane
The Kuaishou page
https://live.kuaishou.com/profile/maomei527
Font-based anti-scraping
Open Dev Tools with F12 and look at the data in the response.
The part that corresponds to the follower count on the page comes back as a string of unreadable characters instead of digits.
Clearly, this is font encryption.
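You can see the problem directly by grabbing the raw response yourself. A minimal sketch (it reuses the "fan" field regex that appears in the full script further down; the headers are an assumption):

import re
import requests

# Sketch: fetch the profile page and print the raw "fan" (follower count) field.
# It comes back as custom codepoints rather than readable digits.
r = requests.get('https://live.kuaishou.com/profile/maomei527',
                 headers={'User-Agent': 'Mozilla/5.0'})
m = re.search(r'"fan"\s*:\s*"(.*?)"', r.text)
if m:
    print(m.group(1))  # a string of \uXXXX-style characters, not digits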
Analysis
Search the page source for "woff" to see whether a font file is referenced. Sure enough, there is a link in the page; download the woff file locally and open it in FontCreator to see the digit glyphs.
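The download step can be scripted as well. A small sketch (it uses the same woff regex as the full script below and saves the font as fonts.woff, the file name used in the next snippets; r is the page response fetched above):

import re
import requests

# Sketch: locate the .woff link in the page source and save the font locally.
m = re.search(r'(http.*?\.woff)', r.text)
if m:
    woff_url = m.group(1)
    with open('fonts.woff', 'wb') as f:
        f.write(requests.get(woff_url).content)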
Since the font, and therefore the mapping between characters and digits, is loaded dynamically on every request, we need to write a module that parses it.
Here we use the fontTools module; install it with pip:
pip install fonttools
Convert the font file we just downloaded into an XML document, which makes it easier to inspect:
from fontTools.ttLib import TTFont
font = TTFont('fonts.woff')
font.saveXML('fonts.xml')
Open the XML and the outline data can be found under the glyf tag:
After downloading several font files and comparing their glyf contents, it turns out that the outline data for each digit never changes. In other words, we only need to record which digit each glyf outline draws, and then, for every woff file the site returns, match its glyf data to decide which digit it is. Checking the maximum and minimum coordinates of each digit, no two digits share all four values.
So a simple comparison of those four values is enough.
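A quick way to convince yourself of this is to check that no two glyphs in the downloaded font share the same bounding box. A small sketch, reusing the fonts.woff file from above:

from fontTools.ttLib import TTFont

# Sketch: the (yMax, xMax, yMin, xMin) bounding box of each digit glyph is distinct,
# so these four values can serve as a fingerprint across different woff files.
font = TTFont('fonts.woff')
glyf = font.get('glyf')
boxes = []
for name in glyf.keys():
    if name.startswith('uni'):
        c = glyf[name]
        boxes.append((c.yMax, c.xMax, c.yMin, c.xMin))
print(len(boxes) == len(set(boxes)))  # True -> every digit glyph has a unique bounding box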
The fontTools module
I recommend running the snippets below in an IPython session so the output is easy to see.
Imports
>>> from fontTools.ttLib import TTFont
>>> font = TTFont('fonts.woff')
>>> font
<fontTools.ttLib.ttFont.TTFont at 0x19c3a4e7b70>
>>> glyf = font.get('glyf')
>>> glyf
<'glyf' table at 19c3a567e80>
>>> glyf.glyphs
{'.notdef': <fontTools.ttLib.tables._g_l_y_f.Glyph at 0x19c3a567b00>,
'glyph1': <fontTools.ttLib.tables._g_l_y_f.Glyph at 0x19c3a567da0>,
'nonmarkingreturn': <fontTools.ttLib.tables._g_l_y_f.Glyph at 0x19c3a56b780>,
'uni0001': <fontTools.ttLib.tables._g_l_y_f.Glyph at 0x19c3a4cda20>,
'space': <fontTools.ttLib.tables._g_l_y_f.Glyph at 0x19c3a4cd828>,
'uniABCE': <fontTools.ttLib.tables._g_l_y_f.Glyph at 0x19c3a4cd400>,
'uniACCD': <fontTools.ttLib.tables._g_l_y_f.Glyph at 0x19c3a4cd080>,
'uniAEDA': <fontTools.ttLib.tables._g_l_y_f.Glyph at 0x19c3a4cde80>,
'uniAEFE': <fontTools.ttLib.tables._g_l_y_f.Glyph at 0x19c3a4cd0f0>,
'uniAFED': <fontTools.ttLib.tables._g_l_y_f.Glyph at 0x19c3a4cdd68>,
'uniBAAA': <fontTools.ttLib.tables._g_l_y_f.Glyph at 0x19c3a4cde48>,
'uniBDDD': <fontTools.ttLib.tables._g_l_y_f.Glyph at 0x19c3a4cdbe0>,
'uniBFAD': <fontTools.ttLib.tables._g_l_y_f.Glyph at 0x19c3a4cd5f8>,
'uniBFAE': <fontTools.ttLib.tables._g_l_y_f.Glyph at 0x19c3a4cdb00>,
'uniC44F': <fontTools.ttLib.tables._g_l_y_f.Glyph at 0x19c3a4cd358>}
>>> c = glyf.glyphs.get('uniACCD')
>>> c
<fontTools.ttLib.tables._g_l_y_f.Glyph at 0x19c3a4cd080>
>>> c.getCoordinates(glyf)
(GlyphCoordinates([(467, 670),(514, 608),(514, 542),(425, 542),(417, 584),(400, 608),(367, 653),(301, 653),(226, 653),(182, 584),(137, 514),(132, 384),(163, 429),(210, 452),(253, 472),(306, 472),(396, 472),(463, 415),(530, 357),(530, 243),(530, 145),(467, 70),(403, -5),(285, -5),(185, -5),(112, 71),(39, 147),(39, 328),(39, 461),(71, 554),(134, 732),(300, 732),(420, 732),(400, 123),(435, 170),(435, 235),(435, 290),(404, 340),(372, 390),(289, 390),(231, 390),(187, 352),(143, 313),(143, 235),(143, 167),(183, 121),(223, 75),(293, 75),(364, 75)]),
[32, 48],
array('B', [1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]))
>>> c.yMax
732
>>> c.xMax
530
>>> c.yMin
-5
>>> c.xMin
39
Writing the code
Save these four values for each digit; then, for every new font file, compare them to tell which digit a glyph is.
for i in glyf.keys():
    if i.startswith('uni'):
        c = glyf[i]
        print('{} ({},{},{},{})'.format(i, c.yMax, c.xMax, c.yMin, c.xMin))
"""
uni0001 (0, 0, 0, 0)
uniABCE (731, 536, 13, 26)
uniACCD (732, 530, -5, 39)
uniAEDA (731, 525, -7, 33)
uniAEFE (729, 526, -6, 32)
uniAFED (730, 525, -6, 25)
uniBAAA (717, 526, -5, 33)
uniBDDD (726, 363, 13, 98)
uniBFAD (732, 527, 13, 32)
uniBFAE (730, 521, -7, 37)
uniC44F (717, 536, 13, 38)
"""
Then, checking each glyph against the digit it draws in FontCreator, write a mapping dict, and write the program while we are at it.
import os
import requests
import re
import json
from fontTools.ttLib import TTFont
font_map = {
    (0, 0, 0, 0): ' ',
    (729, 526, -6, 32): '0',
    (726, 363, 13, 98): '1',
    (732, 527, 13, 32): '2',
    (730, 525, -6, 25): '3',
    (731, 536, 13, 26): '4',
    (717, 526, -5, 33): '5',
    (732, 530, -5, 39): '6',
    (717, 536, 13, 38): '7',
    (731, 525, -7, 33): '8',
    (730, 521, -7, 37): '9',
}
def decrypt_font(character, mapping):
    """Decrypt a single character; return it unchanged if it is not in the mapping."""
    s = character.encode('unicode_escape').decode().strip('\\').upper().strip('U')
    res = mapping.get(s)
    return res if res else character


def create_mapping(font_file):
    """Open the font file and build the mapping between characters and digits."""
    # Open the font file and load the glyf table
    font = TTFont(font_file)
    glyf = font.get('glyf')
    current_map = {}
    # Build the digit mapping for the current font file
    for i in glyf.keys():
        # Skip glyph names that do not start with 'uni'
        if not i.startswith('uni'):
            continue
        c = glyf[i]
        number = font_map.get((c.yMax, c.xMax, c.yMin, c.xMin))
        # A glyph whose bounding box is not in the known set: raise an exception
        if number is None:
            raise Exception
        current_map[i.strip('uni')] = number
    print(json.dumps(current_map, indent=4))
    return current_map


def decrypt_str(s, mapping):
    res = ''
    for c in s:
        res = res + decrypt_font(c, mapping)
    return res
def get_mapping(page):
    m = re.search(r'(http.*?\.woff)', page)
    if m:
        woff_link = m.group(1)
        headers1 = {
            'Host': "static.yximgs.com",
            'Connection': "keep-alive",
            'Upgrade-Insecure-Requests': "1",
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
            'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            'Accept-Encoding': "gzip, deflate, br",
            'Accept-Language': "zh-CN,zh;q=0.9",
            'Cache-Control': "no-cache",
        }
        woff_res = requests.get(woff_link, headers=headers1)
        file_name = woff_link.split('/')[-1]
        with open(file_name, 'wb') as f:
            f.write(woff_res.content)
        mapping = create_mapping(file_name)
        os.remove(file_name)
        return mapping
def get_page(url):
    headers = {
        'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        'Accept-Encoding': "gzip, deflate, br",
        'Accept-Language': "zh-CN,zh;q=0.9",
        'Cache-Control': "max-age=0",
        'Connection': "keep-alive",
        'Host': "live.kuaishou.com",
        'Referer': "https://live.kuaishou.com/search/?keyword=%23%E7%BE%8E%E9%A3%9F",
        'Upgrade-Insecure-Requests': "1",
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
    }
    r = requests.get(url, headers=headers)
    return r
if __name__ == '__main__':
    url = 'https://live.kuaishou.com/profile/maomei527'
    r = get_page(url)
    get_mapping(r.text)
Check the printed mapping against the contents of the font file (in FontCreator) to see whether it is correct.
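It can also be checked programmatically. A small sketch reusing the functions above (the full script below does the same thing): decode the obfuscated "fan" field with the freshly built mapping.

# Sketch: apply the generated mapping to the raw "fan" string from the page.
mapping = get_mapping(r.text)
raw = re.search(r'"fan"\s*:\s*"(.*?)"', r.text).group(1)
print(raw, '->', decrypt_str(raw, mapping))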
Full code
Now add automatic font downloading and character conversion.
During testing, another set of mappings showed up. Comparing it with the earlier one, the glyphs are simply shifted by 13 units along the y axis, so an x/y offset check was added. The final program is as follows:
import os
import requests
import re
import json
from fontTools.ttLib import TTFont
font_map = {
    (0, 0, 0, 0): ' ',
    (729, 526, -6, 32): '0',
    (726, 363, 13, 98): '1',
    (732, 527, 13, 32): '2',
    (730, 525, -6, 25): '3',
    (731, 536, 13, 26): '4',
    (717, 526, -5, 33): '5',
    (732, 530, -5, 39): '6',
    (717, 536, 13, 38): '7',
    (731, 525, -7, 33): '8',
    (730, 521, -7, 37): '9',
}
def decrypt_font(character, mapping):
    """Decrypt a single character: output the decoded digit if possible, otherwise return it unchanged."""
    s = character.encode('unicode_escape').decode().strip('\\').upper().strip('U')
    res = mapping.get(s)
    return res if res else character


def get_number_offset(c, max_offset=20):
    """Work out which digit a glyph maps to, allowing for an offset of its bounding box."""
    number = None
    # i and j are the offsets on the y and x axes respectively
    for i in range(max_offset + 1):
        for j in range(max_offset + 1):
            # positive offset
            number = font_map.get((c.yMax + i, c.xMax + j, c.yMin + i, c.xMin + j))
            if number:
                # print('offset y:{} x:{}'.format(i, j))
                return number
            # negative offset
            number = font_map.get((c.yMax - i, c.xMax - j, c.yMin - i, c.xMin - j))
            if number:
                # print('offset y:{} x:{}'.format(i, j))
                return number
    return number


def create_mapping(font_file):
    """Open the font file and build the mapping between characters and digits."""
    # Open the font file and load the glyf table
    font = TTFont(font_file)
    glyf = font.get('glyf')
    current_map = {}
    # Build the digit mapping for the current font file
    for i in glyf.keys():
        # Skip glyph names that do not start with 'uni'
        if not i.startswith('uni'):
            continue
        c = glyf[i]
        number = get_number_offset(c)
        # A glyph whose bounding box is not in the known set: raise an exception
        if number is None:
            print((c.yMax, c.xMax, c.yMin, c.xMin))
            raise Exception
        current_map[i.strip('uni')] = number
    print(json.dumps(current_map, indent=4))
    return current_map


def decrypt_str(s, mapping):
    """Decrypt a string; characters that do not need decrypting are returned as-is."""
    res = ''
    for c in s:
        res = res + decrypt_font(c, mapping)
    return res
def get_mapping(page):
    m = re.search(r'(http.*?\.woff)', page)
    if m:
        woff_link = m.group(1)
        headers1 = {
            'Host': "static.yximgs.com",
            'Connection': "keep-alive",
            'Upgrade-Insecure-Requests': "1",
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
            'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            'Accept-Encoding': "gzip, deflate, br",
            'Accept-Language': "zh-CN,zh;q=0.9",
            'Cache-Control': "no-cache",
        }
        woff_res = requests.get(woff_link, headers=headers1)
        file_name = woff_link.split('/')[-1]
        with open(file_name, 'wb') as f:
            f.write(woff_res.content)
        mapping = create_mapping(file_name)
        os.remove(file_name)
        return mapping
def get_page(url):
    headers = {
        'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        'Accept-Encoding': "gzip, deflate, br",
        'Accept-Language': "zh-CN,zh;q=0.9",
        'Cache-Control': "max-age=0",
        'Connection': "keep-alive",
        'Host': "live.kuaishou.com",
        'Referer': "https://live.kuaishou.com/search/?keyword=%23%E7%BE%8E%E9%A3%9F",
        'Upgrade-Insecure-Requests': "1",
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
    }
    r = requests.get(url, headers=headers)
    return r
if __name__ == '__main__':
    url = 'https://live.kuaishou.com/profile/maomei527'
    r = get_page(url)
    for i in range(5):
        try:
            mapping = get_mapping(r.text)
            break
        except:
            pass
    raw_s = re.search(r'"fan"\s*:\s*"(.*?)"', r.text).group(1)
    print(raw_s)
    print(decrypt_str(raw_s, mapping))
"""
{
"0001": " ",
"ABCF": "4",
"ACED": "3",
"AEDD": "8",
"AEDE": "0",
"AFCD": "6",
"BDAA": "5",
"BDCD": "1",
"BFAD": "9",
"CCDA": "2",
"CFBE": "7"
}
쳚곭꿍곭.뷍w
2363.1w
"""
"""
{
"0001": " ",
"ABCB": "4",
"ACCD": "3",
"ACDA": "0",
"AEFF": "8",
"AFBB": "6",
"BDCA": "1",
"BDCC": "5",
"BFEF": "9",
"CCAA": "2",
"CFBA": "7"
}
첪곍꾻곍.뷊w
2363.1w
"""