from selenium import webdriver
from bs4 import BeautifulSoup
from markdownify import markdownify as md
import time
import os
from selenium.webdriver.edge.service import Service
from selenium.webdriver import EdgeOptions
import requests
import json
from selenium.webdriver.common.by import By
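
# The script relies on: selenium, beautifulsoup4, markdownify, requests
# (a pip install of those four covers the imports above), plus a local
# msedgedriver matching the installed Edge version.
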
def crawlsleep(times):
    # Central sleep helper so crawl pacing can be tuned in one place.
    time.sleep(times)

def parser_beautiful(innerHTML, article, number, dircrea, bk=False):
    # Recursively walk a BeautifulSoup node and append Markdown to `article`.
    # `number` is the running index used to name images saved under `dircrea`;
    # `bk` marks that we are inside a bold (<b>/<strong>) run.
    if not innerHTML:
        return article, number
    if bk:
        article += "**"
    if isinstance(innerHTML, str):
        # A bare text node (NavigableString): emit it and stop recursing.
        article += str(innerHTML)
        return article, number

    for chi in innerHTML.children:
        tag_name = chi.name
        if isinstance(chi, str):
            article += str(chi)
            continue
        cll = list(chi.children)
        if tag_name in ['table', 'tbody', 'tr', 'td', 'u', 'em']:
            # Structural and inline wrappers: recurse without extra markup.
            article, number = parser_beautiful(chi, article, number, dircrea, bk)
        elif tag_name == "br":
            article += "\n"
        elif tag_name == "p":
            article, number = parser_beautiful(chi, article, number, dircrea, bk)
            article += "\n\n"
        elif tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            # Map <hN> onto a Markdown heading of the same depth.
            article += '#' * int(tag_name[-1]) + ' '
            article, number = parser_beautiful(chi, article, number, dircrea, bk)
            article += '\n\n'
        elif tag_name == "span":
            # Zhihu renders formulas as <span class="ztext-math" data-tex="...">;
            # pull out the raw TeX and emit it as inline math.
            datatex = None
            classc = None
            if 'data-tex' in chi.attrs.keys():
                datatex = chi.attrs["data-tex"]
            if 'class' in chi.attrs.keys():
                classc = chi.attrs["class"]
            if datatex and classc and 'ztext-math' in classc:
                content = datatex.strip(' ')
                if len(content) > 0:
                    # Start the formula on a fresh line unless one is open.
                    if article[-4:] == '<br>' or article[-1:] == '\n':
                        article += "\n$" + content + "$"
                    else:
                        article += "$" + content + "$"
            else:
                # An ordinary span: just recurse into its children.
                article, number = parser_beautiful(chi, article, number, dircrea, bk)
        elif tag_name == "a":
            linksite = None
            if 'href' in chi.attrs.keys():
                linksite = chi.attrs['href']
            if linksite:
                # Unwrap Zhihu's link redirector to recover the target URL.
                linksite = linksite.replace("//link.zhihu.com/?target=https%3A", "").replace("//link.zhihu.com/?target=http%3A", "")
                if len(article) > 0 and article[-1] == '\n':
                    article += "[" + chi.text + "](" + linksite + ")"
                else:
                    article += "\n\n[" + chi.text + "](" + linksite + ")"
        elif tag_name == 'b' or tag_name == 'strong':
            if len(cll) > 1:
                # Mixed children: recurse with bk=True so the whole run stays bold.
                article, number = parser_beautiful(chi, article, number, dircrea, True)
            else:
                # Trailing spaces would break the closing ** in Markdown.
                txt = chi.text.rstrip(' ')
                article += " **" + txt + "** "
        elif tag_name == "figure":
            # Figures carry the real image URL in data-original/data-actualsrc;
            # drop the <noscript> fallback so images are not collected twice.
            noscript = chi.find_all('noscript')
            if len(noscript) > 0:
                chi.noscript.extract()
            imgchunk = chi.find_all('img')
            for img in imgchunk:
                imglink = None
                if 'data-original' in img.attrs.keys():
                    imglink = img.attrs["data-original"]

                if 'data-actualsrc' in img.attrs.keys():
                    imglink = img.attrs['data-actualsrc']

                if imglink is None:
                    imglink = img.attrs["src"]
                try:
                    response = requests.get(imglink, timeout=30)
                except requests.RequestException:
                    try:
                        # One retry before giving up on this image.
                        response = requests.get(imglink, timeout=30)
                    except requests.RequestException:
                        continue
                if response.status_code == 200:
                    # Save the image locally under its running index...
                    imgpath = os.path.join(dircrea, str(number) + '.jpg')
                    with open(imgpath, 'wb') as obj:
                        obj.write(response.content)

                    # ...then mirror it to the image host (the file handle is
                    # closed via `with` instead of being leaked).
                    with open(imgpath, 'rb') as imgfile:
                        response1 = requests.post('https://www.testingcloud.club/sapi/api/image_upload',
                                                  files={'image': imgfile}, timeout=30)
                    if response1.status_code == 200:
                        # Advance the index so the next image does not
                        # overwrite this one locally.
                        number += 1
                        continue
                    # jsons = json.loads(response1.text)
                    # print(jsons)
                    # article += ''' <img src="https://www.testingcloud.club/sapi/api/image_download/%s" width="100%%"/> \n\n''' % jsons['url']

                    requests.put(imglink, timeout=30)
                number += 1
                crawlsleep(1)
        elif tag_name == "div":
            prenode = chi.find_all('code')
            if len(prenode) > 0:
                # Code blocks: emit each <code> child as a fenced block.
                for i in prenode:
                    article += "\n\n```\n" + i.text + "\n```\n\n"
            else:
                article, number = parser_beautiful(chi, article, number, dircrea, bk)
                article += "\n\n"
    if bk:
        # Close the bold marker opened at the top of this call.
        article += "**"
    return article, number
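
# A minimal sanity check of the parser contract (hypothetical fragment, not
# part of the crawl):
#   frag = BeautifulSoup("<p>Hello <b>world</b></p>", "html.parser")
#   text, n = parser_beautiful(frag, "", 0, ".")
#   # text == "Hello  **world** \n\n"; n only advances on downloaded images.
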

# Set the webdriver path -- replace with your own msedgedriver location.
abspath = os.path.abspath(__file__)

# Note os.path.dirname: joining onto the script file itself would produce an
# invalid path like ".../script.py/msedgedriver/...".
driverpath = os.path.join(os.path.dirname(abspath), 'msedgedriver', 'msedgedriver.exe')
service = Service(executable_path=driverpath)
edge_options = EdgeOptions()

# https://stackoverflow.com/questions/53039551/selenium-webdriver-modifying-navigator-webdriver-flag-to-prevent-selenium-detec
edge_options.add_experimental_option('excludeSwitches', ['enable-automation'])
edge_options.add_experimental_option('useAutomationExtension', False)
edge_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
# This is the line that tells the Chromium engine to drop its webdriver traces.
edge_options.add_argument("disable-blink-features=AutomationControlled")

# Initialize the webdriver
driver = webdriver.Edge(options=edge_options, service=service)

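# Optional sanity check (a sketch; run it after a page has loaded): with
# AutomationControlled disabled above, this is expected to print None or
# False instead of True on most Chromium builds.
#   print(driver.execute_script("return navigator.webdriver"))
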
# URL of the WeChat Official Account article
url = 'https://mp.weixin.qq.com/s?__biz=MzI4Njg5MDA5NA==&mid=2247483973&idx=2&sn=483265ffa9087ca956ec2d637119a5f8&chksm=ebd74344dca0ca5298b894fbb706c26ee942a423e858e27679f06df4b83899e1a97cc9d5eb97&scene=21###wechat_redirect'

# Open the page
driver.get(url)

# Give the page a fixed amount of time to finish loading
time.sleep(5)

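# A sketch of a more robust wait than the fixed sleep, using Selenium's
# explicit waits (assumes the js_content div is what signals "loaded"):
#   from selenium.webdriver.support.ui import WebDriverWait
#   from selenium.webdriver.support import expected_conditions as EC
#   WebDriverWait(driver, 15).until(
#       EC.presence_of_element_located((By.ID, "js_content")))
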
# Grab the rendered page source
html = driver.page_source

# Parse the HTML
soup = BeautifulSoup(html, 'html.parser')

# Extract the article title
title = soup.find('h1', {'class': 'rich_media_title'}).get_text(strip=True)

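# Note: soup.find(...) returns None when the selector misses (e.g. a deleted
# or redirected article). A defensive variant of the lookup above would be:
#   node = soup.find('h1', {'class': 'rich_media_title'})
#   title = node.get_text(strip=True) if node else 'untitled'
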
# Extract the article body: WeChat keeps it in <div id="js_content">; the
# <section> element is the rich-text container that actually gets walked.
content = soup.find('div', {'id': 'js_content'})
richtext = driver.find_element(By.TAG_NAME, "section")
article = ''
# print(content)  # debug output
inner = driver.execute_script("return arguments[0].innerHTML;", richtext)
innerHTML = BeautifulSoup(inner, "html.parser")
# 'd://' is the directory that downloaded images are written into.
res, num = parser_beautiful(innerHTML, article, 0, 'd://', False)

# Alternative: convert the extracted HTML to Markdown wholesale
# markdown = md(str(content))

# Write the Markdown to a file named after the article title
with open(f'{title}.md', 'w', encoding='utf-8') as f:
    f.write(res)

# Shut down the webdriver
driver.quit()