A-A+

python爬取反爬小说网站内容

2024年04月19日 08:50 汪洋大海 暂无评论 共3686字 (阅读290 views次)

打开网址,F12。发现网页直接跳转了~

那么,就新建一个标签页,先F12,设置,禁用JS

再次粘贴网址,跳转。对比一下网页的文字和html源代码中的文字。

images

 

下一步确认一下文字动态还是静态

按照上面的方法,再次新建一个标签页,重新打开网址,发现两次源代码的文字是不一样的。确认是动态加载!

images

images

 

并且字体为base64编码,提取出来,保存为TTF文件!

这里推荐一个查字体的网站:https://www.bejson.com/ui/font/

打开网站后,上传字体文件。

images

 

分析到这里,大致的思路已经出来了~

读取网页源代码->提取base64编码的字体文件->取出来字体文件中的字符图片以及真实的Unicode字符->识别图片中的文字->按照规则进行文章的文字替换

 

取出来字体文件中的字符图片以及真实的Unicode字符

images

images

 

现在的目的达到了,接下来就是识别这个字符的字,然后把文章的"要"替换为"发"

现在软件可以通过uni8981查询到这个字是"要",但是软件并不知道这个字实际代表的是"发"

所以,我的思路就是使用图像识别

 

import base64
import re
 
import bs4
import easyocr
import requests
from fontTools.ttLib import TTFont
from PIL import Image, ImageDraw, ImageFont
import os
 
 
def ocr(folder_dir: str) -> dict:
    """Build a mapping from obfuscated characters to the real characters
    recognized from their rendered glyph images.

    Every file in *folder_dir* is expected to be named ``uXXXX.png`` (as
    produced by ``export_glyph_images``), where ``XXXX`` is the hex code
    point of the obfuscated character.

    Args:
        folder_dir: Directory containing the exported glyph PNGs.

    Returns:
        dict mapping the obfuscated character (what the HTML contains)
        to the character EasyOCR read from its image (what it looks like).
    """
    reader = easyocr.Reader(['ch_sim'], gpu=False)
    key_words = {}
    for file_name in os.listdir(folder_dir):
        full_path = os.path.join(folder_dir, file_name)
        # "u8981.png" -> 0x8981 -> '要'.  Parsing the hex code point
        # directly is more robust than the original unicode_escape
        # round-trip, which only worked for exactly four hex digits.
        stem, _ = os.path.splitext(file_name)
        error_key_word = chr(int(stem.lstrip('u'), 16))
        try:
            success_key_word = reader.readtext(full_path, detail=False)[0]
        except IndexError:
            # OCR recognized nothing; fall back to a placeholder glyph.
            success_key_word = '一'
        key_words[error_key_word] = success_key_word
    return key_words
 
 
def get_html():
    """Fetch the raw HTML of the target Zhihu paid-column page.

    Sends a desktop-Chrome-like header set so the anti-scraping layer
    serves the same obfuscated-font markup a real browser would get.

    Returns:
        str: The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within 30 seconds.
    """
    url = "https://www.zhihu.com/market/paid_column/1730607810226688000/section/1730181148968325120"

    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
        'cache-control': 'no-cache',
        'pragma': 'no-cache',
        'sec-ch-ua': '"Google Chrome";v="123", "Not:A-Brand";v="8", "Chromium";v="123"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
    }

    # timeout prevents the script from hanging forever on a stalled
    # connection; the original call had no timeout at all.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    return response.text
 
 
def html_analyze(html_body: str):
    """Extract the article text from the page HTML.

    Args:
        html_body: Raw HTML of the column page.

    Returns:
        str: The text of every ``<p>`` element, joined with newlines.
    """
    html_obj = bs4.BeautifulSoup(html_body, features="html.parser")
    # find_all is the current bs4 API; findAll is a deprecated alias.
    return '\n'.join(item.get_text() for item in html_obj.find_all('p'))
 
 
def clear_directory(path):
    """Remove every regular file directly inside *path*.

    Subdirectories (and their contents) are left untouched; failures on
    individual files are reported but do not stop the sweep.
    """
    for entry in os.listdir(path):
        target = os.path.join(path, entry)
        try:
            if os.path.isfile(target):
                os.remove(target)
        except Exception as exc:
            print(f"Error deleting {target}: {exc}")
 
 
def export_glyph_images(ttf_file_path, output_folder, image_size=(200, 200), font_size=200):
    """Render every mapped glyph of a TTF font to its own PNG image.

    Each file is named ``uXXXX.png`` after the glyph's Unicode code
    point, the naming scheme ``ocr`` relies on.

    Args:
        ttf_file_path: Path of the TTF font to rasterize.
        output_folder: Directory receiving the PNGs (created if missing).
        image_size: (width, height) of each PNG in pixels.
        font_size: Point size used to render each glyph.
    """
    os.makedirs(output_folder, exist_ok=True)

    font = TTFont(ttf_file_path)
    try:
        cmap = font.getBestCmap()
        pil_font = ImageFont.truetype(ttf_file_path, font_size)

        # Iterate only the code points; the glyph names are unused.
        for unicode_char in cmap:
            char = chr(unicode_char)
            img = Image.new("RGBA", image_size, color="white")
            draw = ImageDraw.Draw(img)

            # Anchor "mm" centers the glyph on the canvas so a full-size
            # glyph is not clipped at the top/left edge.  The original
            # drew from (0, 0) and passed a conflicting font_size kwarg
            # (TypeError on Pillow < 10, silently ignored when font= is
            # also given on Pillow >= 10).
            draw.text((image_size[0] / 2, image_size[1] / 2), char,
                      font=pil_font, fill="black", anchor="mm")

            unicode_hex_str = f'u{unicode_char:04X}' + ".png"
            print(unicode_hex_str)
            image_file_path = os.path.join(output_folder, unicode_hex_str)
            img.save(image_file_path)
    finally:
        # Release the font handle even if rendering raises.
        font.close()
 
 
if __name__ == '__main__':
    # Directory that caches the rendered glyph images.
    output_folder = r"xxxxxxx"

    clear_directory(output_folder)
    html_body = get_html()

    # The page embeds several @font-face rules; the fourth one (index 3)
    # carries the obfuscated font as a base64-encoded data URI.
    font_face_rules = re.findall(r'@font-face\s*{([^}]*)}', html_body)
    # Raw string here: the original pattern 'base64,(.*?)\);' relied on
    # an invalid escape sequence in a non-raw string literal.
    base64_string = re.findall(r'base64,(.*?)\);', font_face_rules[3])[0]
    with open('zhihu.ttf', 'wb') as f:
        f.write(base64.b64decode(base64_string))

    # Render each obfuscated glyph to a PNG, then OCR the images to
    # learn what each glyph really looks like.
    ttf_file_path = "zhihu.ttf"
    export_glyph_images(ttf_file_path, output_folder)
    keywords = ocr(output_folder)

    # Single O(n) pass: each article character is replaced at most once,
    # exactly like the original index-blacklist loop but without the
    # O(n*m) rescans.
    content = html_analyze(html_body)
    restored = ''.join(keywords.get(ch, ch) for ch in content)

    with open('知乎盐选.txt', 'w', encoding='utf-8') as f:
        f.write(restored)

文章来源:https://www.52pojie.cn/thread-1911970-1-1.html

布施恩德可便相知重

微信扫一扫打赏

支付宝扫一扫打赏

×

给我留言