知乎的字体加密 – lemon

获取到 base64 字符串后，删除其头部信息（在这里是 data:font/opentype;base64, ，末尾的逗号也要一并删除），剩下的就是纯字体数据。比如：T1RUTwAJAIAAAwAQQ0ZGIBcEq......（过长，不完整展示）。
访问 http://www.motobit.com/util/base64-decoder-encoder.asp 这个网站，将纯字体数据字符串粘贴进编辑区域，然后在下方的解码选项中选择“解码 base64 字符串”并“导出为二进制文件”。

该网站会将 base64 解码后的结果保存为二进制的 .bin 文件；把下载得到的 .bin 文件的后缀改为 .ttf，即可得到字体文件。
百度字体编辑器FontEditor (kekee000.github.io)
参考博客：
https://www.likecs.com/show-308355464.html
http://e.betheme.net/article/show-157263.html?action=onClick
# -*- coding: UTF-8 -*-
"""=========================================================
@Project -> File: aiqicSpider -> zhihu
@IDE: PyCharm
@author: lxc
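@date: 2023/5/25 4:04 PM
@Desc:
1 - Purpose:
    Fetch a Zhihu paid-column section, undo its font obfuscation and save
    the readable text to a .txt file.
2 - Steps:
    1 - Request the page and extract the title, the body text and the
        embedded base64 font.
    2 - Decode the font to a .ttf file and dump it to XML with fontTools.
    3 - Build a code-point mapping from the XML cmap table and substitute
        the obfuscated characters.
    4 - Write the result to output/<date>/<title>.txt.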
"""
import os
import datetime
from w3lib import html
import re
from lxml import etree
import requests
from utils.util import *
from fontTools.ttLib import TTFont
import xml.etree.ElementTree as ET
import logging
import base64
# Module-level logger (registered under the fixed name 'log'); zhxs_main uses
# it to report failures of the individual pipeline steps.
logger = logging.getLogger('log')

def get_new_string(string, filePath='source/zhihuxiaoshuo_decode.xml'):
    """
    Undo the font obfuscation of a piece of text.

    The XML dump of the font is parsed, a mapping from obfuscated code points
    to real ones is built from its ``cmap`` table, and every character of
    ``string`` that appears in the mapping is substituted. Characters that
    are not in the mapping are kept as they are.

    :param string: text whose characters may be obfuscated by the custom font
    :param filePath: path of the XML dump of the font; defaults to the
        location that ``base64_to_xml`` writes to with its default argument
    :return: the text with every mapped character replaced
    """
    tree = ET.parse(filePath)
    # The ``version`` attribute of <cmap>/<tableVersion> is reused as the
    # platformID filter for the format-4 sub-table.
    # NOTE(review): this only works when both numbers coincide in the dumped
    # font -- confirm against a real dump.
    version = tree.find('./cmap/tableVersion').get('version')
    map_trees = tree.findall(f'./cmap/cmap_format_4[@platformID="{version}"]/map')
    # Key: the ``code`` attribute without its first two characters, value: the
    # glyph ``name`` without its first three characters, both prefixed with a
    # literal backslash-u (presumably "0x4e00" / "uni4e00" style values --
    # verify against the XML).
    font_dict = {
        (r'\u' + map_tree.get('code')[2:]): (r'\u' + map_tree.get('name')[3:])
        for map_tree in map_trees
    }
    # Translate character by character; only characters present in the
    # mapping are changed.
    pieces = []
    for s in string:
        unicode_s = zh_to_unicode(s)
        # Single lookup; mapped values are never empty, so ``or`` is equivalent
        # to the former "get(...) if get(...) else ..." double lookup.
        pieces.append(font_dict.get(unicode_s) or unicode_s)
    return unicode_to_zh(''.join(pieces))

def base64_to_xml(base64str, filePath='source/zhihuxiaoshuo_decode.ttf'):
    """
    Decode a base64 font, store it as a font file and dump it to XML.

    The XML file is written next to the font file, with the font file's
    extension replaced by ``.xml``.

    :param base64str: base64-encoded font data (without the ``data:`` header)
    :param filePath: where the decoded binary font file is written
    :raises ValueError: if ``base64str`` is empty
    """
    if not base64str:
        # Fail before touching the disk so that a previously decoded font is
        # not overwritten with an empty file.
        raise ValueError('base64str is empty: there is no font data to decode')
    # Decode first: malformed base64 must not truncate an existing font file.
    font_bytes = base64.b64decode(base64str)
    with open(filePath, 'wb') as f:
        f.write(font_bytes)
    # Swap only the real extension. str.replace('.ttf', '.xml') would rewrite
    # every '.ttf' occurrence in the path and, for a path without '.ttf',
    # make the XML dump overwrite the font file itself.
    xml_path = os.path.splitext(filePath)[0] + '.xml'
    font = TTFont(filePath)
    try:
        font.saveXML(xml_path)
    finally:
        # Release the file handle held by the TTFont reader.
        font.close()
def get_content_font(url='', timeout=30):
    """
    Download a page and extract its title, body text and embedded font.

    :param url: address of the page to fetch
    :param timeout: timeout in seconds handed to ``requests.get`` so that a
        stalled connection cannot block forever
    :return: tuple ``(title, content, base64_string)``; ``content`` is the
        text of the ``<div id="manuscript">`` element with tags removed and
        ``base64_string`` is the base64 payload of the embedded font (``''``
        when the page contains no matching ``src: url(data:font/ttf...)``)
    :raises IndexError: if the page has no ``<div id="manuscript">`` element
    """
    headers = {
        "Host": "www.zhihu.com",
        "cache-control": "max-age=0",
        "sec-ch-ua": "\"Not.A/Brand\";v=\"8\", \"Chromium\";v=\"114\", \"Microsoft Edge\";v=\"114\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.0.0",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "sec-fetch-site": "none",
        "sec-fetch-mode": "navigate",
        "sec-fetch-user": "?1",
        "sec-fetch-dest": "document",
        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "x-forwarded-for": "4.2.2.2"
    }
    # NOTE(review): these look like session cookies captured from a browser
    # and committed to source; they are presumably account-bound and will
    # expire -- consider loading them from configuration instead.
    cookies = {
        "_zap": "a83827de-3b91-4fcb-8cac-b05c0ce2c722",
        "d_c0": "AWBYhnhPphaPTjRW3tDn7abGKTptvpsXWg4=|1681898026",
        "YD00517437729195%3AWM_TID": "tlc%2B8RbWlU1AEVVAUBKEfswCf0D76f6G",
        "YD00517437729195%3AWM_NI": "1a48fqyfOV1Qek9xh7T6V2KYMug3eGp7rlxL%2B5wSQWZ7NWDVzL7Jbx%2BE3Ot54j5HRGPLDmfMEFCfsl84CRZR7dlymKR7lcyBszRqRaMzx04uYRMAwmsSykSvix7DGFOrVlk%3D",
        "YD00517437729195%3AWM_NIKE": "9ca17ae2e6ffcda170e2e6ee8ac2688ef1feccdc799cb48ea3d85b828a9facc53c8888a7b9c739a9bc99d8d62af0fea7c3b92af3bffcd9e57bbbba81b7b45ba7b5f799eb3d98ea9ad3ca538faba9d4e84db4e9be8dfb48e9ebb9b4d45d9ae982b8c75eba94f9abe174e99bfdb0f73ff8b6fba9d14d94948c8ee547879a8d87f268fcec8db3d95af5bca49af04192e9a484fc4ab1a69e92d16b8fa6b8d1b77f9b8e83a2f667b18db78cd27eaba8f9a5c64e8dae9a8dea37e2a3",
        "__snaker__id": "iUG6Tku5N70vX1Vy",
        "gdxidpyhxdE": "ndOZ%2BuniZlLfR3P74HSwhp9%2FtNw7eCBbuae3759jtbki87e9StUKEJR75RRNsGUGzp2bJVKHg132UKnp%2FzuKC%5Cb7v%5C718AqQDwm0PSa0tDayzE7K4kcjlnMlTpaE3woJkn4pS1%2BYAamHHQlYkjz%2F9fMkiBHkcCuj1%2FzS9pWgsqEB594m%3A1684985764828",
        "captcha_session_v2": "2|1:0|10:1684984938|18:captcha_session_v2|88:WXhPY0hIQ2RVTEI4Qi91TTV0dGQ4a25GTU13L1R4ZHI5ZVc4Zkd5MFZieHhvTVVqdTY1QjA3cHlORDdGUW05MA==|15966ce0b32650122e3711c7951a753abff1705e19797653ae407991ad84cd1f",
        "captcha_ticket_v2": "2|1:0|10:1684984948|17:captcha_ticket_v2|704:eyJ2YWxpZGF0ZSI6IkNOMzFfdnJlUXdUQUR0VUd6TkhpbnAtSGEtLkJlUDIxeFlTQjRxa2tzNV9HdWJlcEVneGhnanZEeVJIaF9GUmFEWm5QVlJxRFlrWEFVa0NKcUtyT3NIaGYxQk55U3NibS4uQXZzYlhXOHJYeTE2b0EuQzZwYlg2UC5IQ21yUXdMd3dPZnpDZE9jeTRHOTU5TFJUQWdVNkdHNmJpUWFrODZBQ2tFS1EyOVFlVmlyR2NfSDhZLmRINEdiUmljRHZzY2MuOHE5RlQxcnZoVjhoUER5WG0uVFc2QlQxLk5ERTlqRDBQSWRYbG8tUWJUTVZZRTFCOElaU0Z6VWluSFZNNW9mWWJHQjBzZ2VjWWxNVlJOUGROQzVNSzBmVWlwWEdaQVBpLnFHUnpQV0NRalB1MXZsWjlyWjd0b0NKOUd3eHBDNFp3cGxUNlNjVnJBV3ZZRGZJdl9vLjQ0TlVSODRwOENURDhvaTFrbnN0UXVaQW9ReV9SLlctVFQ2dm00V1ZqcWg3aWNsTXlmWVNaVTlWclQ3MHdBQ2tNaVdoQ0h0cWNfMUVTQUxkZXpxZS1tODV6S25zbVB3cWxjck1tYmJfQjFwTEZ2TXNDNXVmeDgta3ppY05XRWFIZUQyUWFKZTB1NXhRSGxYWVdPQ25nV29LcTAyVXdKX1ZNU2RHSk5mLjlwMyJ9|2177d12942c4eff4c1b0baa54e8134edbdf1813ae1b86c35e82cc9dc1b90ba72",
        "q_c1": "8dd23ed357234c70bd90121c81245e15|1684985081000|1684985081000",
        "tst": "r",
        "z_c0": "2|1:0|10:1684985158|4:z_c0|92:Mi4xLWpYUVJnQUFBQUFCWUZpR2VFLW1GaVlBQUFCZ0FsVk4tU0pjWlFERUtuZHFSTUV1UlBaSWlmTVpJbDBSbWRqUjl3|ddb60321ede5e0f5d84736a7f387bd2c147421df927de42add63619add23dc2c",
        "_xsrf": "IzgYPBKsxbRtg2EWChUJjfXmPoSDHCZG",
        "Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49": "1684907499,1684984858,1684994068,1685005668",
        "KLBRSID": "c450def82e5863a200934bb67541d696|1685033161|1685033150"
    }
    params = {
        "is_share_data": "true",
        "vp_share_title": "0"
    }
    # Without a timeout requests waits indefinitely on an unresponsive server.
    response = requests.get(url, headers=headers, cookies=cookies, params=params, timeout=timeout)
    tree = etree.HTML(response.text)
    title = ''.join(tree.xpath('//*[@id="app"]/div/h1/text()'))
    content_tree = tree.xpath("//div[@id='manuscript']")
    if not content_tree:
        # Same exception type as the former bare content_tree[0] lookup, but
        # with a message that says what is actually missing.
        raise IndexError('no <div id="manuscript"> element found in the response for %s' % url)
    # Serialise the body container, strip the markup, then turn the entities
    # left in the serialised text back into characters.
    doc = etree.tostring(content_tree[0])
    html_content = html.remove_tags(doc)
    # NOTE(review): ('£') is a plain string, not a tuple, and w3lib's ``keep``
    # is documented as a collection of entity names (e.g. 'pound') -- confirm
    # the intent. Left unchanged to preserve the current output.
    content = html.replace_entities(html_content, keep=('£'))
    # Raw string: the pattern is unchanged, but "\(" is no longer an invalid
    # escape sequence in a normal string literal.
    base64_string = ''.join(re.findall(r'src: url\(data:font/ttf;charset=utf-8;base64,(.*?)\), url', response.text))
    return title, content, base64_string
def zhxs_main(url=''):
    """
    Run the whole pipeline for one page: fetch, decode the font, save the text.

    Each step is wrapped in its own ``try`` block; when a step fails the
    exception is logged together with the URL and the function returns
    ``None`` without running the remaining steps.

    :param url: address of the page to process
    :return: path of the written ``.txt`` file
        (``output/<YYYY-MM-DD>/<title>.txt``), or ``None`` if any step failed
    """
    # Step 1: fetch the body text and the embedded font.
    try:
        title, content, base64_string = get_content_font(url)
    except Exception as e:
        logger.exception(e)
        logger.error("采集：获取正文和字体失败！！！\n 【%s】", url)
        return None
    # Step 2: store the font file (and its XML dump).
    try:
        base64_to_xml(base64_string)
    except Exception as e:
        logger.exception(e)
        logger.error("存储字体文件失败！！！\n 【%s】", url)
        return None
    # Step 3: translate the obfuscated characters back.
    try:
        new_content = get_new_string(content)
    except Exception as e:
        logger.exception(e)
        logger.error("获取正确内容失败！！！\n 【%s】", url)
        return None
    # Step 4: write the text file.
    try:
        file_path = 'output/' + datetime.datetime.now().strftime('%Y-%m-%d')
        # makedirs also creates a missing 'output' parent and, with
        # exist_ok=True, does not fail when the directory already exists.
        os.makedirs(file_path, exist_ok=True)
        # The title comes from the downloaded page: neutralise path
        # separators so it cannot point outside of file_path.
        safe_title = title
        for separator in (os.sep, os.altsep):
            if separator:
                safe_title = safe_title.replace(separator, '_')
        file_name = file_path + '/' + safe_title + '.txt'
        with open(file_name, 'w', encoding='utf-8') as f:
            f.write(new_content)
    except Exception as e:
        logger.exception(e)
        logger.error("存储txt文件失败！！！\n 【%s】", url)
        return None
    return file_name

if __name__ == '__main__':
    # Script entry point: process one sample paid-column section.
    zhxs_main(
        'https://www.zhihu.com/market/paid_column/1575901064246743040/section/1578384160636489728'
    )
发表回复 取消回复

发表回复取消回复