Scraping an anti-crawling novel site's content with Python
Open the URL and press F12. The page immediately redirects away ~
So open a new tab, press F12 first, go to the DevTools settings, and disable JavaScript.
Paste the URL again and load it. Compare the text rendered on the page with the text in the HTML source.
The next step is to confirm whether the text is dynamic or static.
Following the same method, open the URL once more in yet another new tab: the text in the source differs between the two loads. That confirms it is dynamically generated!
Moreover, the font is embedded as base64. Extract it and save it as a TTF file!
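A minimal sketch of this extraction step, assuming the page source is already in hand (save_embedded_font is a hypothetical helper; the complete script at the end does the same thing inline):

import base64
import re

def save_embedded_font(html_body: str, out_path: str = 'extracted.ttf') -> None:
    # Collect the CSS bodies of all @font-face rules on the page, then pull
    # the base64 payload out of the data URI in one of them. Which block
    # holds the obfuscated article font varies per page.
    css_blocks = re.findall(r'@font-face\s*{([^}]*)}', html_body)
    b64_payload = re.search(r'base64,([^)]+)\)', css_blocks[0]).group(1)
    with open(out_path, 'wb') as f:
        f.write(base64.b64decode(b64_payload))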
Here I recommend a site for inspecting fonts: https://www.bejson.com/ui/font/
Open the site and upload the font file.
At this point the overall approach is clear ~
Read the page source -> extract the base64-encoded font file -> dump each glyph image in the font together with the Unicode character it is mapped to -> recognize the text in the images -> replace the article's characters according to the resulting mapping
Dumping the glyph images and their mapped Unicode characters from the font file
That goal is now achieved. The next step is to recognize the character each glyph actually depicts, and then replace, for example, every "要" in the article with "发".
The font viewer can look up uni8981 and report that this code point is "要", but it has no way of knowing that the glyph stored there actually draws "发".
So my approach is to use image recognition.
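A quick way to see the mismatch with fontTools, assuming the extracted font has been saved as zhihu.ttf (which the script below does):

from fontTools.ttLib import TTFont

font = TTFont('zhihu.ttf')
cmap = font.getBestCmap()  # code point -> glyph name
print(cmap[0x8981])        # e.g. 'uni8981': the name says 要...
# ...but the outline stored under that name draws a different character,
# and only OCR (or a human looking at the rendered glyph) can tell which.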
import base64
import re
import bs4
import easyocr
import requests
from fontTools.ttLib import TTFont
from PIL import Image, ImageDraw, ImageFont
import os
def ocr(folder_dir: str) -> dict:
    reader = easyocr.Reader(['ch_sim'], gpu=False)
    key_words = {}
    for file_name in os.listdir(folder_dir):
        full_path = os.path.join(folder_dir, file_name)
        # The file is named after the code point, e.g. 'u8981.png'; prefixing
        # a backslash turns it into the escape sequence '\u8981', which
        # decodes to the character the code point claims to be ('要')
        error_key_word = f'\\{file_name}'.encode('utf-8').decode('unicode_escape').replace('.png', '')
        try:
            # OCR the glyph image to find the character it actually draws
            success_key_word = reader.readtext(full_path, detail=False)[0]
        except IndexError:
            # OCR recognized nothing; fall back to a placeholder
            success_key_word = '一'
        key_words.update({error_key_word: success_key_word})
    return key_words
def get_html():
    url = "https://www.zhihu.com/market/paid_column/1730607810226688000/section/1730181148968325120"
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
        'cache-control': 'no-cache',
        'pragma': 'no-cache',
        'sec-ch-ua': '"Google Chrome";v="123", "Not:A-Brand";v="8", "Chromium";v="123"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    return response.text
def html_analyze(html_body: str):
    html_obj = bs4.BeautifulSoup(html_body, features="html.parser")
    return '\n'.join([item.get_text() for item in html_obj.find_all('p')])
def clear_directory(path):
    for filename in os.listdir(path):
        filepath = os.path.join(path, filename)
        try:
            if os.path.isfile(filepath):
                os.remove(filepath)
        except Exception as e:
            print(f"Error deleting {filepath}: {e}")
def export_glyph_images(ttf_file_path, output_folder, image_size=(200, 200), font_size=200):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    font = TTFont(ttf_file_path)
    cmap = font.getBestCmap()  # code point -> glyph name
    pil_font = ImageFont.truetype(ttf_file_path, font_size)
    for unicode_char, glyph_name in cmap.items():
        char = chr(unicode_char)
        img = Image.new("RGBA", image_size, color="white")
        draw = ImageDraw.Draw(img)
        # Draw the glyph the font actually stores for this code point
        draw.text((0, 0), char, font=pil_font, fill="black", align="center")
        # Save the image, named after the code point, e.g. u8981.png
        unicode_hex_str = f'u{unicode_char:04X}' + ".png"
        print(unicode_hex_str)
        image_file_path = os.path.join(output_folder, unicode_hex_str)
        img.save(image_file_path)
    font.close()
if __name__ == '__main__':
    # Cache directory for the rendered glyph images
    output_folder = r"xxxxxxx"
    clear_directory(output_folder)
    html_body = get_html()
    # Pull every @font-face block out of the page source; on this page the
    # obfuscated article font is the fourth one
    font_face_regex = re.findall(r'@font-face\s*{([^}]*)}', html_body)
    base64_string = re.findall(r'base64,(.*?)\);', font_face_regex[3])[0]
    binary_data = base64.b64decode(base64_string)
    with open('zhihu.ttf', 'wb') as f:
        f.write(binary_data)
    # Path to the extracted TTF file
    ttf_file_path = "zhihu.ttf"
    # Render every glyph in the font to an image
    export_glyph_images(ttf_file_path, output_folder)
    # Map each character a code point claims to be to the character
    # its glyph actually draws
    keywords = ocr(output_folder)
    content = list(html_analyze(html_body))
    filter_list = []
    for error_key, success_key in keywords.items():
        for index in range(len(content)):
            if content[index] == error_key and index not in filter_list:
                content[index] = success_key
                filter_list.append(index)
    with open('知乎盐选.txt', 'w', encoding='utf-8') as f:
        f.write(''.join(content))
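A side note on the replacement loop: filter_list prevents a character that has already been substituted from being substituted again (a later error_key can equal an earlier success_key). str.translate gives the same guarantee in a single pass; a sketch using the same keywords mapping:

# Alternative to the index loop above: str.maketrans accepts a
# {character: replacement} dict, and translate() applies every mapping
# in one pass, so no substitution is applied on top of another.
table = str.maketrans(keywords)
fixed_text = html_analyze(html_body).translate(table)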
Source: https://www.52pojie.cn/thread-1911970-1-1.html