zhihu_spider/crawler.py

#################
###ZouJiu-202306
#################
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver import EdgeOptions
import os
from selenium.webdriver.common.by import By
import time
import pickle
import json
from selenium.webdriver.support.wait import WebDriverWait
import requests
from copy import deepcopy
import argparse
from datetime import datetime
# from selenium.webdriver.common import keys
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.actions.wheel_input import ScrollOrigin
# import numpy as np
import shutil
# from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.print_page_options import PrintOptions
# from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support import expected_conditions as EC
import base64
from zipfile import ZipFile
from bs4 import BeautifulSoup
abspath = os.path.abspath(__file__)
filename = abspath.split(os.sep)[-1]
abspath = abspath.replace(filename, "")
import sys
sys.path.append(abspath)
# wkhtmltopdf_path = os.path.join(abspath, r'wkhtmltopdf\bin\wkhtmltopdf.exe')
# sys.path.append(wkhtmltopdf_path)
from thinkdeal import *
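# Cookie persistence helpers: the Selenium cookies are pickled to disk after a
# manual login and loaded back on later runs so the crawler can skip the login step.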
def save_cookie(driverkkk, path):
#https://stackoverflow.com/questions/45417335/python-use-cookie-to-login-with-selenium
with open(path, 'wb') as filehandler:
pickle.dump(driverkkk.get_cookies(), filehandler)
def load_cookie(driverkkk, path):
#https://stackoverflow.com/questions/45417335/python-use-cookie-to-login-with-selenium
with open(path, 'rb') as cookiesfile:
cookies = pickle.load(cookiesfile)
for cookie in cookies:
driverkkk.add_cookie(cookie)
def crawlsleep(times):
time.sleep(times)
def now():
return time.time()
def nowtime():
nowtm = datetime.fromtimestamp(time.time()).isoformat().replace(":", "_")
return nowtm
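# Start Microsoft Edge through Selenium with a few anti-detection tweaks: the
# automation infobar/extension are disabled, Blink's AutomationControlled
# feature is turned off, navigator.webdriver is patched to undefined, and the
# user agent is overridden through the Chrome DevTools Protocol.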
def edgeopen(driverpath):
service=Service(executable_path=driverpath)
edge_options = EdgeOptions()
#https://stackoverflow.com/questions/53039551/selenium-webdriver-modifying-navigator-webdriver-flag-to-prevent-selenium-detec
edge_options.add_experimental_option('excludeSwitches', ['enable-automation'])
edge_options.add_experimental_option('useAutomationExtension', False)
edge_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
edge_options.add_argument("disable-blink-features=AutomationControlled") # this single flag tells the browser to drop the webdriver automation fingerprint
# #https://stackoverflow.com/questions/56897041/how-to-save-opened-page-as-pdf-in-selenium-python
# settings = {
# "recentDestinations": [{
# "id": "Save as PDF",
# "origin": "local",
# "account": "",
# }],
# "selectedDestinationId": "Save as PDF",
# "version": 2
# }
# #https://www.apiref.com/java11-zh/java.desktop/javax/print/attribute/standard/MediaSize.ISO.html
# settings = {
# "recentDestinations": [{
# "id": "Save as PDF",
# "origin": "local",
# "account": ""
# }],
# "selectedDestinationId": "Save as PDF",
# "version": 2,
# "isHeaderFooterEnabled": False,
# "mediaSize": {
# "height_microns": 297000,
# "name": "ISO_A4",
# "width_microns": 210000,
# "custom_display_name": "A4"
# },
# "customMargins": {"margin": 0},
# "marginsType": 3,
# # "scaling": 130,
# # "scalingType": 3,
# # "scalingTypePdf": 3,
# "isCssBackgroundEnabled": True
# }
# prefs = {
# 'printing.print_preview_sticky_settings.appState': json.dumps(settings),
# 'savefile.default_directory': pdfpath,
# }
# edge_options.add_experimental_option('prefs', prefs)
# edge_options.add_argument('--kiosk-printing')
# https://www.selenium.dev/documentation/webdriver/drivers/options/#pageloadstrategy
# https://stackoverflow.com/questions/44503576/selenium-python-how-to-stop-page-loading-when-certain-element-gets-loaded
# edge_options.add_argument(page_load_strategy, 'normal')
# if strategy:
edge_options.page_load_strategy = 'normal'
# cap = DesiredCapabilities.EDGE
# cap['pageLoadStrategy'] = 'none'
driver = webdriver.Edge(options=edge_options, service = service)
driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36'})
driver.set_script_timeout(20)
return driver
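# Switch the sign-in page to the password tab and give the user 20 seconds to
# finish logging in manually; the caller saves the cookies afterwards.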
def login(driver):
driver.find_elements(By.CLASS_NAME, "SignFlow-tab")[1].click()
# driver.find_elements(By.CLASS_NAME, "username-input")[0].send_keys("")
# driver.find_elements(By.CLASS_NAME, "username-input")[1].send_keys("")
# driver.find_element(By.CLASS_NAME, "SignFlow-submitButton").click()
time.sleep(20)
# WebDriverWait(driver, timeout=60).until(lambda d:d.find_element(By.CLASS_NAME, "TopstoryTabs-link"))
return driver
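# Collect the links of all articles written by `username`: walk every page of
# https://www.zhihu.com/people/<username>/posts and write one "<link> <title>"
# line per article to article/article.txt.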
def crawl_article_links(driver:webdriver, username:str):
#crawl articles links
articles = r'https://www.zhihu.com/people/zoujiu1/posts'
articles_one = r'https://www.zhihu.com/people/zoujiu1/posts?page='
article_detail = r'https://zhuanlan.zhihu.com/p/'
driver.get(articles.replace("zoujiu1", username))
pages = driver.find_elements(By.CLASS_NAME, 'PaginationButton')[-2]
assert isinstance(int(pages.text), int)
maxpages = int(pages.text)
all_article_detail = {}
#how many pages of articles
for p in range(1, maxpages + 1):
driver.get(articles_one + str(p))
items = driver.find_elements(By.CLASS_NAME, "ArticleItem")
#crawl article one by one
for a in range(len(items)):
introduce = items[a].get_attribute("data-zop")
itemId = json.loads(introduce)
links = items[a].find_elements(By.TAG_NAME, 'a')[0].get_attribute('href')
# id = itemId['itemId']
title = str(itemId['title'])
all_article_detail[str(title)] = links #article_detail + str(id)
crawlsleep(sleeptime)
with open(os.path.join(articledir, 'article.txt'), 'w', encoding='utf-8') as obj:
for key, val in all_article_detail.items():
obj.write(val + " " + key + '\n')
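# Same idea for answers: walk the /answers pagination pages and write one
# "<link> <title>" line per answer to answer/answers.txt.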
def crawl_answers_links(driver:webdriver, username:str):
#crawl answers links
answer = r'https://www.zhihu.com/people/zoujiu1/answers'
answer_one = r'https://www.zhihu.com/people/zoujiu1/answers?page='
driver.get(answer.replace("zoujiu1", username))
WebDriverWait(driver, timeout=10).until(lambda d: d.find_element(By.CLASS_NAME, "Pagination"))
pages = driver.find_elements(By.CLASS_NAME, 'PaginationButton')[-2]
assert isinstance(int(pages.text), int)
maxpages = int(pages.text)
all_answer_detail = []
#how many pages of answers
for p in range(1, maxpages + 1):
driver.get(answer_one + str(p))
WebDriverWait(driver, timeout=10).until(lambda d: d.find_element(By.CLASS_NAME, "Pagination"))
items = driver.find_elements(By.CLASS_NAME, "AnswerItem")
#crawl answer one by one
for i in range(len(items)):
introduce = items[i].get_attribute("data-zop")
itemId = json.loads(introduce)
id = itemId['itemId']
title = str(itemId['title'])
links = items[i].find_elements(By.TAG_NAME, 'a')[0].get_attribute('href')
all_answer_detail.append([links, str(title)])
crawlsleep(sleeptime)
with open(os.path.join(answerdir, 'answers.txt'), 'w', encoding='utf-8') as obj:
for links, title in all_answer_detail:
obj.write(links + " " + title + '\n')
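# Crawl the user's pins ("thoughts"): each pin's text and external links are
# written to a folder named after its timestamp, and the attached images are
# downloaded by stepping through the image gallery. Pins whose folder already
# exists are skipped, so an interrupted crawl can be resumed.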
def crawl_think_links(driver:webdriver, username:str):
#crawl think links
think = r'https://www.zhihu.com/people/zoujiu1/pins'
think_one = r'https://www.zhihu.com/people/zoujiu1/pins?page='
driver.get(think.replace("zoujiu1", username))
WebDriverWait(driver, timeout=10).until(lambda d: d.find_element(By.CLASS_NAME, "Pagination"))
pages = driver.find_elements(By.CLASS_NAME, 'PaginationButton')[-2]
assert isinstance(int(pages.text), int)
maxpages = int(pages.text)
# all_think_detail = []
#how many pages of think
allbegin = now()
numberpage = 1e-6
for p in range(1, maxpages + 1):
begin = now()
driver.get(think_one + str(p))
WebDriverWait(driver, timeout=10).until(lambda d: d.find_element(By.CLASS_NAME, "Pagination"))
items = driver.find_elements(By.CLASS_NAME, "PinItem")
#crawl pins one by one
for i in range(len(items)):
RichContent = items[i].find_element(By.CLASS_NAME, 'RichContent-inner')
clockitem = items[i].find_element(By.CLASS_NAME, 'ContentItem-time')
try:
WebDriverWait(items[i], timeout=10).until(lambda d: len(d.text) > 2)
except:
driver.get(think_one + str(p))
WebDriverWait(driver, timeout=10).until(lambda d: d.find_element(By.CLASS_NAME, "Pagination"))
items = driver.find_elements(By.CLASS_NAME, "PinItem")
RichContent = items[i].find_element(By.CLASS_NAME, 'RichContent-inner')
clockitem = items[i].find_element(By.CLASS_NAME, 'ContentItem-time')
WebDriverWait(items[i], timeout=10).until(lambda d: len(d.text) > 2)
# clockspan = clockitem.find_element(By.TAG_NAME, 'span')
clock = clockitem.text
clock = clock[3 + 1:].replace(" ", "_").replace(":", "_")
dirthink = os.path.join(thinkdir, clock)
if os.path.exists(dirthink):
continue
os.makedirs(dirthink, exist_ok=True)
try:
RichContent.find_element(By.CLASS_NAME, 'Button').click()
WebDriverWait(items[i], timeout=10).until(lambda d: d.find_element(By.CLASS_NAME, "RichContent-inner"))
RichContent = items[i].find_element(By.CLASS_NAME, 'RichContent-inner')
except:
pass
content = RichContent.find_element(By.CLASS_NAME, 'RichText')
links_col = content.find_elements(By.TAG_NAME, 'a')
links = []
for itext in links_col:
try:
links.append(itext.get_attribute("href"))
except:
continue
text = content.text.strip()
if len(text)!=0:
with open(os.path.join(dirthink, clock+".txt"), 'w', encoding='utf-8') as obj:
obj.write(text.replace("<br>", '\n').replace("<br data-first-child=\"\">", '\n')+"\n")
for itext in links:
obj.write(itext + "\n")
# all_think_detail.append([text])
try:
items[i].find_elements(By.CLASS_NAME, 'Image-PreviewVague')[0].click()
except:
continue
cnt = 0
while True:
WebDriverWait(driver, timeout=10).until(lambda d: d.find_element(By.CLASS_NAME, "ImageGallery-Inner"))
img = driver.find_element(By.CLASS_NAME, 'ImageGallery-Inner').find_element(By.TAG_NAME, 'img')
imglink = img.get_attribute('data-original')
if imglink==None:
imglink = img.get_attribute("src")
try:
response = requests.get(imglink, timeout=30)
except:
try:
response = requests.get(imglink, timeout=30)
except:
continue
if response.status_code==200:
with open(os.path.join(dirthink, clock + "_" + str(cnt) + '.jpg'), 'wb') as obj:
obj.write(response.content)
cnt += 1
crawlsleep(sleeptime)
try:
disable = driver.find_element(By.CLASS_NAME, 'ImageGallery-arrow-right')
if 'disabled' in disable.get_attribute('class'):
driver.find_element(By.CLASS_NAME, 'ImageGallery-close').click()
break
else:
disable.click()
except:
break
crawlsleep(sleeptime)
end = now()
print("time to crawl one pin:", clock, round(end - begin, 3))
logfp.write("time to crawl one pin: " + clock + " " + str(round(end - begin, 3)) + "\n")
numberpage += 1
# crawlsleep(600)
allend = now()
print("average time per pin:", round((allend - allbegin) / numberpage, 3))
logfp.write("average time per pin: " + str(round((allend - allbegin) / numberpage, 3)) + "\n")
dealthink(thinkdir)
def cleartxt(kkk):
while ' 'in kkk:
kkk = kkk.replace(" ", "")
while "\n" in kkk:
kkk = kkk.replace("\n", "")
return kkk
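# Convert the innerHTML of a Zhihu rich-text block (parsed with BeautifulSoup)
# into Markdown: headings, paragraphs, bold text, links, LaTeX formulas
# (span.ztext-math -> $...$), code blocks and tables are handled, and every
# <figure> image is downloaded into `dircrea` and re-uploaded to the
# testingcloud.club image API. Returns the accumulated text and the image counter.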
def parser_beautiful(innerHTML, article, number, dircrea, bk=False):
if not innerHTML:
return article, number
if bk:
article += "**"
if isinstance(innerHTML, str):
article += innerHTML.text
return article, number
for chi in innerHTML.children:
# article, number = parser_beautiful(chi, article, number, dircrea, bk)
tag_name = chi.name
if isinstance(chi, str):
article += chi.text
continue
else:
cll = [c for c in chi.children]
if tag_name in ['table', 'tbody', 'tr', 'td', 'u', 'em']:
article, number = parser_beautiful(chi, article, number, dircrea, bk)
elif tag_name=="br":
article += "\n"
elif tag_name=="p":
article, number = parser_beautiful(chi, article, number, dircrea, bk)
article += "\n\n"
# elif tag_name=="br":
# article += "<br>\n"
elif tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
article += '#' * int(tag_name[-1]) + ' '
article, number = parser_beautiful(chi, article, number, dircrea, bk)
article += '\n\n'
elif tag_name=="span":
datatex = None
classc = None
if 'data-tex' in chi.attrs.keys():
datatex = chi.attrs["data-tex"]
if 'class' in chi.attrs.keys():
classc = chi.attrs["class"]
if datatex and classc and 'ztext-math' in classc:
content = chi.attrs["data-tex"]
while len(content) > 0 and ' '==content[0]:
content = content[1:]
while len(content) > 0 and ' '==content[-1]:
content = content[:-1]
if len(content) > 0:
if article[-3-1:]=='<br>' or article[-1:]=='\n':
article += "\n$" + content + "$"
else:
article += "$" + content + "$"
else:
article, number = parser_beautiful(chi, article, number, dircrea, bk)
# article += nod.text
elif tag_name=="a":
linksite = None
if 'href' in chi.attrs.keys():
linksite = chi.attrs['href']
if linksite:
linksite = linksite.replace("//link.zhihu.com/?target=https%3A", "").replace("//link.zhihu.com/?target=http%3A", "")
if len(article) > 0 and article[-1]=='\n':
article += "["+chi.text+"]"+"("+linksite + ")"
else:
article += "\n\n["+chi.text+"]"+"("+linksite + ")"
elif tag_name=='b' or tag_name=='strong':
if len(cll) > 1:
article, number = parser_beautiful(chi, article, number, dircrea, True)
else:
txt = chi.text
while len(txt) > 0 and txt[-1] == " ":
txt = txt[:-1]
article += " **" + txt + "** "
elif tag_name=="figure":
noscript = chi.find_all('noscript')
if len(noscript) > 0:
chi.noscript.extract()
imgchunk = chi.find_all('img')
for i in range(len(imgchunk)):
imglink = None
if 'data-original' in imgchunk[i].attrs.keys():
imglink = imgchunk[i].attrs["data-original"]
if 'data-actualsrc' in imgchunk[i].attrs.keys():
imglink = imgchunk[i].attrs['data-actualsrc']
if imglink==None:
imglink = imgchunk[i].attrs["src"]
try:
response = requests.get(imglink, timeout=30)
except:
try:
response = requests.get(imglink, timeout=30)
except:
continue
if response.status_code==200:
# article += '''<img src="%d.jpg"/>'''%number
# dircrea.replace('.', '') had no effect here: str.replace returns a new string and the result was not used
with open(os.path.join(dircrea, str(number) + '.jpg'), 'wb') as obj:
obj.write(response.content)
files = {'image': open(os.path.join(dircrea, str(number) + '.jpg'), 'rb')}
response1 = requests.post('https://www.testingcloud.club/sapi/api/image_upload',
files=files,timeout=30)
if response1.status_code==200:
if(response1.text != ''):
jsons = json.loads(response1.text)
print(jsons)
article += ''' <img src="https://www.testingcloud.club/sapi/api/image_download/%s" width="100%%"/> \n\n'''%jsons['url']
requests.put(imglink, timeout=30)
number += 1
crawlsleep(sleeptime)
elif tag_name=="div":
prenode = chi.find_all('code')
if len(prenode) > 0:
for i in prenode:
article += "\n\n```\n" + i.text + "\n```\n\n"
else:
article, number = parser_beautiful(chi, article, number, dircrea, bk)
article += "\n\n"
if bk:
article += "**"
return article, number
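# Older WebElement-based variant of parser_beautiful: walks childNodes via
# driver.execute_script and appends Markdown. Kept for reference; the crawl
# functions below use the BeautifulSoup parser instead.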
def recursion(nod, article, number, driver, dircrea, bk=False):
if isinstance(nod, dict):
if 'nodeName' in nod.keys() and nod['nodeName']=='#text':
kkk = cleartxt(nod['textContent'])
if len(kkk) > 0:
if bk:
article += "**"
article += nod['textContent']
if bk:
article += "**"
return article, number
elif isinstance(nod, webdriver.remote.webelement.WebElement):
tag_name = nod.tag_name
if tag_name=="br":
article += "<br>\n"
elif tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
article += "\n" + '#' * int(tag_name[-1]) + ' '
try:
p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
for pnode in p_childNodes:
article, number = recursion(pnode, article, number, driver, dircrea, bk)
except:
pass
article += '\n'
elif tag_name=="span":
datatex = nod.get_attribute("data-tex")
classc = nod.get_attribute("class")
if datatex and classc and 'ztext-math' in classc:
if article[-3-1:]=='<br>' or article[-1:]=='\n':
article += "\n$" + nod.get_attribute("data-tex") + "$"
else:
article += "$" + nod.get_attribute("data-tex") + "$"
else:
imgchunk = nod.find_elements(By.TAG_NAME, 'img')
achunk = nod.find_elements(By.TAG_NAME, 'a')
if len(imgchunk)==0 and len(achunk)==0:
if bk:
article += "**"
article += nod.text
if bk:
article += "**"
else:
p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
for pnode in p_childNodes:
article, number = recursion(pnode, article, number, driver, dircrea, bk)
# else:
# formula_span = nod.find_elements(By.CLASS_NAME, "ztext-math")
# for jf in range(len(formula_span)):
# ele = formula_span[jf]
# article += "$" + ele.get_attribute("data-tex") + "$"
elif tag_name=="a":
linksite = nod.get_attribute("href")
if linksite:
linksite = linksite.replace("//link.zhihu.com/?target=https%3A", "").replace("//link.zhihu.com/?target=http%3A", "")
if article[-3-1:]=='<br>' or article[-1:]=='\n':
article += "\n\n["+nod.text+"]"+"("+linksite + ")"
else:
article += "["+nod.text+"]"+"("+linksite + ")"
elif tag_name=="b" or tag_name=="strong":
try:
p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
for pnode in p_childNodes:
article, number = recursion(pnode, article, number, driver, dircrea, True)
except:
txt = nod.text
while len(txt) > 0 and txt[-1] == " ":
txt = txt[:-1]
article += " **" + txt + "** "
elif tag_name=="em":
if bk:
article += "**"
article += nod.text
if bk:
article += "**"
# elif tag_name=='td':
# article += nod.text
elif tag_name in ['table', 'tbody', 'tr', 'td', 'u']:
p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
for pnode in p_childNodes:
article, number = recursion(pnode, article, number, driver, dircrea, bk)
elif tag_name=='p':
try:
p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
for pnode in p_childNodes:
article, number = recursion(pnode, article, number, driver, dircrea, bk)
except:
article += nod.text
article += "\n"
elif tag_name=="div":
# atags = nod.find_elements(By.TAG_NAME, 'a')
prenode = nod.find_elements(By.TAG_NAME, 'code')
if len(prenode) > 0:
for i in prenode:
article += "<br>\n```\n" + i.text + "\n```\n<br>"
else:
p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
for pnode in p_childNodes:
article, number = recursion(pnode, article, number, driver, dircrea, bk)
elif tag_name=="figure":
imgchunk = nod.find_elements(By.TAG_NAME, 'img')
for i in range(len(imgchunk)):
imglink = imgchunk[i].get_attribute("data-original")
if imglink==None:
imglink = imgchunk[i].get_attribute("src")
try:
response = requests.get(imglink, timeout=30)
except:
try:
response = requests.get(imglink, timeout=30)
except:
continue
if response.status_code==200:
article += ''' <img src="%d.jpg" width="100%%"/> '''%number
# article += '''<img src="%d.jpg"/>'''%number
with open(os.path.join(dircrea, str(number) + '.jpg'), 'wb') as obj:
obj.write(response.content)
# response = requests.post(url, files=files)  # disabled: 'url' and 'files' are not defined in this scope; the image upload is done in parser_beautiful
number += 1
crawlsleep(sleeptime)
return article, number
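# Crawl every article listed in article/article.txt: open the page, scroll it
# to the bottom, strip header/action widgets, optionally convert the body to
# Markdown and push it to the article_tree API, then print the page to a
# timestamped PDF. Articles that already have a non-empty PDF on disk are skipped.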
def crawl_article_detail(driver:webdriver):
website_col = {}
for i in os.listdir(articledir):
try:
kk = int(i)
shutil.rmtree(os.path.join(articledir, i))
except:
pass
with open(os.path.join(articledir, 'article.txt'), 'r', encoding='utf-8') as obj:
for i in obj.readlines():
i = i.strip()
ind = i.index(" ")
website = i[:ind]
title = i[ind+1:].replace(" ", "_").replace("\n", "")
website_col[website] = title
allbegin = now()
numberpage = 1e-6
for website, title in website_col.items():
begin = now()
nam = title.replace(":", "_").replace("?", ";"). \
replace("/","_").replace("\\","_").replace("\"", "_").\
replace("*","_").replace("|", "_").replace(" ", "_").replace("", "").replace("", "").\
replace("<", "小于").replace(">", "大于").replace("(", "").\
replace(")", "")
temp_name = nam #str(np.random.randint(999999999)) + str(np.random.randint(999999999))
if len(temp_name) > 200:
temp_name = temp_name[:100]
direxit = False
fileexit = False
dirname = ''
filesize = 0
kkk = -9
for i in os.listdir(articledir):
if nam in i and os.path.isdir(os.path.join(articledir, i)):
direxit = True
dirname = i
fileexit = os.path.exists(os.path.join(articledir, dirname, nam + ".pdf"))
if fileexit:
filesize = os.path.getsize(os.path.join(articledir, dirname, nam + ".pdf"))
# break
if direxit and fileexit and filesize > 0:
if '_IP_' in dirname:
filnam = dirname[16+1:].split("_IP_")[0]
else:
filnam = dirname[16+1:][:-1]
if filnam == nam:
kkk = 9
break
if kkk > 0:
continue
dircrea = os.path.join(articledir, temp_name)
os.makedirs(dircrea, exist_ok = True)
#get article text
driver.get(website)
WebDriverWait(driver, timeout=10).until(lambda d: d.find_element(By.CLASS_NAME, "Post-Topics"))
#https://stackoverflow.com/questions/61877719/how-to-get-current-scroll-height-in-selenium-python
scrollHeight = driver.execute_script('''return document.documentElement.scrollHeight''')
footer = driver.find_element(By.TAG_NAME, "html")
scroll_origin = ScrollOrigin.from_element(footer, 0, -60)
ActionChains(driver).scroll_from_origin(scroll_origin, 0, -100000).perform()
for i in range(18):
try:
ActionChains(driver).scroll_from_origin(scroll_origin, 0, scrollHeight//18).perform()
except:
try:
ActionChains(driver).scroll_from_origin(scroll_origin, 0, -scrollHeight//18).perform()
except:
pass
crawlsleep(0.8)
#remove noneed element
try:
driver.execute_script('''document.getElementsByClassName("Post-Sub")[0].remove();''')
except:
pass
try:
driver.execute_script('''document.getElementsByClassName("ColumnPageHeader-Wrapper")[0].remove();''')
except:
pass
try:
driver.execute_script('''document.getElementsByClassName("RichContent-actions")[0].remove();''')
except:
pass
if MarkDown_FORMAT:
richtext = driver.find_element(By.CLASS_NAME, "Post-RichText")
titletext = driver.find_element(By.CLASS_NAME, "Post-Title")
# article_childNodes = driver.execute_script("return arguments[0].childNodes;", richtext)
article = ""
number = 0
article = article + '[转载自](%s)' % website + '\n\n'
# for nod in article_childNodes:
# article, number = recursion(nod, article, number, driver, dircrea)
inner = driver.execute_script("return arguments[0].innerHTML;", richtext)
innerHTML = BeautifulSoup(inner, "html.parser")
article, number = parser_beautiful(innerHTML, article, number, dircrea)
article = article.replace("修改\n", "").replace("开启赞赏\n", "开启赞赏, ").replace("添加评论\n", "").replace("分享\n", "").\
replace("收藏\n", "").replace("设置\n", "")
tle = titletext.text
article += "<br>\n\n["+driver.current_url+"](" + driver.current_url + ")<br>\n"
if len(article) > 0:
try:
f=open(os.path.join(dircrea, nam + "_formula_" + ".md"), 'w', encoding='utf-8')
f.close()
except:
nam = nam[:len(nam)//2]
with open(os.path.join(dircrea, nam + "_formula_" + ".md"), 'w', encoding='utf-8') as obj:
obj.write("# " + tle+"\n\n")
if len(article) > 0:
obj.write(article + "\n\n\n")
#https://www.testingcloud.club/sapi/api/article_tree
resp = requests.put("https://www.testingcloud.club/sapi/api/article_tree",json.dumps({
"title": nam,
"content":article,
"spend_time":1,
"father":2500,
"level":1,
"is_public":1,
"author":"admin"
}))
print(resp)
# article to pdf
clocktxt = driver.find_element(By.CLASS_NAME, "Post-NormalMain").find_element(By.CLASS_NAME, "ContentItem-time")
crawlsleep(1)
url = driver.current_url
driver.execute_script("const para = document.createElement(\"h2\"); \
const br = document.createElement(\"br\"); \
const node = document.createTextNode(\"%s\");\
para.appendChild(node);\
const currentDiv = document.getElementsByClassName(\"Post-Header\")[0];\
currentDiv.appendChild(br); \
currentDiv.appendChild(para);"%url \
)
clock = clocktxt.text[3+1:].replace(" ", "_").replace(":", "_")
pagetopdf(driver, dircrea, temp_name, nam, articledir, url, Created=clock)
crawlsleep(sleeptime)
#https://stackoverflow.com/questions/23359083/how-to-convert-webpage-into-pdf-by-using-python
#https://github.com/JazzCore/python-pdfkit
# if article_to_jpg_pdf_markdown:
# config = pdfkit.configuration(wkhtmltopdf = wkhtmltopdf_path)
# pdfkit.from_url(website, os.path.join(dircrea, nam_pinyin+".pdf"), configuration = config)
end = now()
print("time to crawl one article:", title, round(end - begin, 3))
logfp.write("time to crawl one article: " + title + " " + str(round(end - begin, 3)) + "\n")
numberpage += 1
# crawlsleep(600)
allend = now()
print("average time per article:", round((allend - allbegin) / numberpage, 3))
logfp.write("average time per article: " + str(round((allend - allbegin) / numberpage, 3)) + "\n")
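# Print the current page to PDF with Selenium's print_page, write a small txt
# file holding the creation time and URL, then rename the working directory to
# "<time>_<title>_<location>".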
def pagetopdf(driver, dircrea, temp_name, nam, destdir, url, Created=""):
fileexit = os.path.exists(os.path.join(dircrea, temp_name + ".pdf"))
if fileexit:
try:
os.remove(os.path.join(dircrea, temp_name + ".pdf"))
except:
pass
printop = PrintOptions()
printop.shrink_to_fit = True
# printop.margin_left = 0
# printop.margin_right = 0
# printop.margin_top = 0
# printop.margin_bottom = 0
# printop.page_height = 29.7
# printop.page_width = 21
printop.background = True
printop.scale = 1.0
pdf = driver.print_page(print_options=printop)
with open(os.path.join(dircrea, nam + ".pdf"), 'wb') as obj:
obj.write(base64.b64decode(pdf))
# driver.execute_script('window.print();')
clock = Created #clocktxt.text[3+1:].replace(" ", "_").replace(":", "_")
with open(os.path.join(dircrea, clock+".txt"), 'w', encoding='utf-8') as obj:
obj.write(clock+"\n")
obj.write(url)
clocktmp = clock.split(".")[0].replace("T", "_")
clock = clocktmp.split("・")[0]  # drop the "・IP 属地..." location suffix
address = ""
try:
address += clocktmp.split("・")[1]
except:
pass
try:
os.rename(dircrea, os.path.join(destdir, clock + "_" + nam + "_" + address))
except:
crawlsleep(3+addtime)
try:
os.rename(dircrea, os.path.join(destdir, clock + "_" + nam + "_" + address))
except:
pass
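# Crawl every answer listed in answer/answers.txt: expand the question text,
# remove side widgets, optionally convert question and answer to Markdown with
# created/modified timestamps, and save the page as a PDF. Answers that already
# have a non-empty PDF on disk are skipped.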
def crawl_answer_detail(driver:webdriver):
website_col = {}
for i in os.listdir(answerdir):
try:
kk = int(i)
shutil.rmtree(os.path.join(answerdir, i))
except:
pass
with open(os.path.join(answerdir, 'answers.txt'), 'r', encoding='utf-8') as obj:
for i in obj.readlines():
i = i.strip()
ind = i.index(" ")
website = i[:ind]
title = i[ind+1:].replace(" ", "_").replace("\n", "")
website_col[website] = title
allbegin = now()
numberpage = 1e-6
for website, title in website_col.items():
begin = now()
nam = title.replace(":", "_").replace("?", ";"). \
replace("/","_").replace("\\","_").replace("\"", "_").\
replace("*","_").replace("|", "_").replace(" ", "_").replace("", "").replace("", "").\
replace("<", "小于").replace(">", "大于")
if len(nam) > 200:
nam = nam[:100]
temp_name = nam #str(np.random.randint(999999999)) + str(np.random.randint(999999999))
# nam_pinyin = pinyin.get(nam, format='numerical')
# if '不定积分该用什么方' not in title:
# continue
direxit = False
fileexit = False
dirname = ''
filesize = 0
for i in os.listdir(answerdir):
if nam in i and os.path.isdir(os.path.join(answerdir, i)):
direxit = True
dirname = i
fileexit = os.path.exists(os.path.join(answerdir, dirname, nam + ".pdf"))
if fileexit:
filesize = os.path.getsize(os.path.join(answerdir, dirname, nam + ".pdf"))
break
dircrea = os.path.join(answerdir, temp_name)
if direxit and fileexit and filesize > 0:
if '_IP_' in dirname:
filnam = dirname[16+1:].split("_IP_")[0]
else:
filnam = dirname[16+1:][:-1]
if filnam == nam:
continue
os.makedirs(dircrea, exist_ok = True)
#get article text
driver.get(website)
time.sleep(10)
# WebDriverWait(driver, timeout=10).until(lambda d: d.find_element(By.CLASS_NAME, "AnswerItem-editButtonText"))
#https://stackoverflow.com/questions/61877719/how-to-get-current-scroll-height-in-selenium-python
scrollHeight = driver.execute_script('''return document.getElementsByClassName("QuestionAnswer-content")[0].scrollHeight''')
footer = driver.find_element(By.TAG_NAME, "html")
scroll_origin = ScrollOrigin.from_element(footer, 0, 0)
ActionChains(driver).scroll_from_origin(scroll_origin, 0, -100000).perform()
for i in range(18):
try:
ActionChains(driver).scroll_from_origin(scroll_origin, 0, scrollHeight//18).perform()
except:
try:
ActionChains(driver).scroll_from_origin(scroll_origin, 0, -scrollHeight//18).perform()
except:
pass
crawlsleep(0.8)
ActionChains(driver).scroll_from_origin(scroll_origin, 0, -100000).perform()
article = ""
number = 0
try:
QuestionRichText = driver.find_element(By.CLASS_NAME, "QuestionRichText")
button = QuestionRichText.find_element(By.CLASS_NAME, "QuestionRichText-more")
WebDriverWait(driver, timeout=20).until(EC.element_to_be_clickable((By.CLASS_NAME, "QuestionRichText-more")))
crawlsleep(max(2, sleeptime))
button.click()
question_RichText = QuestionRichText.find_element(By.CLASS_NAME, "RichText")
# question_childNodes = driver.execute_script("return arguments[0].childNodes;", question_RichText)
article += "# question <br>\n"
# for nod in question_childNodes:
# article, number = recursion(nod, article, number, driver, dircrea)
inner = driver.execute_script("return arguments[0].innerHTML;", question_RichText)
innerHTML = BeautifulSoup(inner, "html.parser")
article, number = parser_beautiful(innerHTML, article, number, dircrea)
except:
pass
article += "<br>\n\n\n# answer <br>\n"
#remove noneed element
try:
driver.execute_script('''document.getElementsByClassName("MoreAnswers")[0].remove();''')
except:
pass
try:
driver.execute_script('''document.getElementsByClassName("ViewAll")[0].remove();''')
except:
pass
try:
driver.execute_script('''document.getElementsByClassName("ViewAll")[0].remove();''')
except:
pass
try:
driver.execute_script('''document.getElementsByClassName("AppHeader")[0].remove();''')
except:
pass
try:
driver.execute_script('''document.getElementsByClassName("Reward")[0].remove();''')
except:
pass
try:
driver.execute_script('''document.getElementsByClassName("Question-sideColumn")[0].remove();''')
except:
pass
Created = "not found"
Modified = "not found"
QuestionAnswer = driver.find_element(By.CLASS_NAME, "QuestionAnswer-content")
richtext = QuestionAnswer.find_element(By.CLASS_NAME, "CopyrightRichText-richText")
Createdtime = QuestionAnswer.find_element(By.CLASS_NAME, "ContentItem-time")
Created = Createdtime.text[4:].replace(" ", "_").replace(":", "_").replace(".", "_")
if MarkDown_FORMAT:
metatext = QuestionAnswer.find_elements(By.TAG_NAME, "meta")
for i in range(len(metatext)):
# if metatext[i].get_attribute("itemprop")=="dateCreated":
# Created = metatext[i].get_attribute("content").replace(" ", "_").replace(":", "_").replace(".", "_")
if metatext[i].get_attribute("itemprop")=="dateModified":
Modified = metatext[i].get_attribute("content").replace(" ", "_").replace(":", "_").replace(".", "_")
# answer_childNodes = driver.execute_script("return arguments[0].childNodes;", richtext)
# for nod in answer_childNodes:
# article, number = recursion(nod, article, number, driver, dircrea)
inner = driver.execute_script("return arguments[0].innerHTML;", richtext)
innerHTML = BeautifulSoup(inner, "html.parser")
article, number = parser_beautiful(innerHTML, article, number, dircrea)
article = article.replace("修改\n", "").replace("开启赞赏\n", "开启赞赏, ").replace("添加评论\n", "").replace("分享\n", "").\
replace("收藏\n", "").replace("设置\n", "")
url = driver.current_url
article += "<br>\n\n["+url+"](" + url + ")<br>\n"
driver.execute_script("const para = document.createElement(\"h2\"); \
const br = document.createElement(\"br\"); \
const node = document.createTextNode(\"%s\");\
para.appendChild(node);\
const currentDiv = document.getElementsByClassName(\"QuestionHeader-title\")[0];\
currentDiv.appendChild(br); \
currentDiv.appendChild(para);"%url \
)
if len(article) > 0:
try:
f=open(os.path.join(dircrea, nam + "_formula_" + ".md"), 'w', encoding='utf-8')
f.close()
except:
nam = nam[:len(nam)//2]
with open(os.path.join(dircrea, nam + "_formula_" + ".md"), 'w', encoding='utf-8') as obj:
obj.write("# "+ title+"\n\n")
if len(article) > 0:
obj.write(article + "\n\n\n")
obj.write("Created: " + Created + "\n")
obj.write("Modified: " + Modified + "\n")
# article to pdf
pagetopdf(driver, dircrea, temp_name, nam, answerdir, url, Created=Created)
crawlsleep(sleeptime)
end = now()
print("time to crawl one answer:", title, round(end - begin, 3))
logfp.write("time to crawl one answer: " + title + " " + str(round(end - begin, 3)) + "\n")
numberpage += 1
# crawlsleep(600)
allend = now()
print("average time per answer:", round((allend - allbegin) / numberpage, 3))
logfp.write("average time per answer: " + str(round((allend - allbegin) / numberpage, 3)) + "\n")
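# Open zhihu.com and try to reuse the pickled cookie. If the cookie is missing
# or expired, fall back to a manual login, save a fresh cookie and exit so the
# crawler can be restarted. On success the profile menu is opened to read the
# username from the profile URL.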
def login_loadsavecookie():
website = r"https://www.zhihu.com/signin"
#login and save cookies of zhihu
driver = edgeopen(driverpath)
driver.get(website)
try:
load_cookie(driver, cookie_path)
driver.get(r"https://www.zhihu.com/")
WebDriverWait(driver, timeout=10).until(lambda d: d.find_element(By.ID, 'Popover15-toggle'))
toggle = driver.find_element(By.ID, 'Popover15-toggle')
except Exception as e:
if os.path.exists(cookie_path):
os.remove(cookie_path)
print("The saved browser cookie has expired; the old cookie file was deleted, please log in again and a new cookie will be saved.")
else:
print("Please log in once so the cookie can be saved; later runs will not need a login.")
driver = login(driver)
save_cookie(driver, cookie_path)
driver.quit()
exit(0)
try:
driver.find_element(By.ID, 'Popover15-toggle').click()
driver.find_element(By.CLASS_NAME, 'Menu-item').click()
except:
crawlsleep(6)
driver.get(r"https://www.zhihu.com/")
crawlsleep(3)
driver.find_element(By.ID, 'Popover15-toggle').click()
driver.find_element(By.CLASS_NAME, 'Menu-item').click()
url = driver.current_url
username = url.split("/")[-1]
return driver, username
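# Download msedgedriver if it is not already present: scrape the official Edge
# WebDriver page for the win64 zip link, unzip it under ./msedgedriver and
# rename the executable to msedgedriver.exe.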
def downloaddriver():
url = "https://msedgedriver.azureedge.net/116.0.1938.62/edgedriver_win64.zip"
if not os.path.exists(driverpath):
ret = requests.get("https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/")
if ret.status_code!=200:
raise RuntimeError("failed to fetch the Edge WebDriver download page, HTTP status %d" % ret.status_code)
ret = BeautifulSoup(ret.content, 'html.parser')
# divall = ret.find_all('div', class_=r'common-card--lightblue')
ddl = ret.find_all('a')
for k in ddl:
key = k.attrs.keys()
if 'href' not in key:
continue
href = k.attrs['href']
if 'href' in key and "win64" in href and ".zip" in href:
url = href
break
response = requests.get(url)
if response.status_code==200:
os.makedirs(os.path.join(abspath, 'msedgedriver'), exist_ok=True)
with open(os.path.join(abspath, 'msedgedriver', 'edgedriver.zip'), 'wb') as obj:
obj.write(response.content)
with ZipFile(os.path.join(abspath, 'msedgedriver/edgedriver.zip'), "r") as obj:
obj.extractall(os.path.join(abspath, 'msedgedriver'))
nth = os.path.join(abspath, 'msedgedriver')
for r, d, f in os.walk(nth):
kk = 6
for i in f:
if 'driver' in i and '.exe' in i:
try:
shutil.move(os.path.join(r, i), os.path.join(nth, i))
except:
pass
os.rename(os.path.join(nth, i), os.path.join(nth, "msedgedriver.exe"))
kk = -6
break
if kk < 0:
break
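# Top-level crawl: make sure the driver exists and log in (re-downloading the
# driver once if startup fails), then crawl pins, articles and answers
# according to the command line flags.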
def zhihu():
# #crawl articles links
try:
downloaddriver()
driver, username = login_loadsavecookie()
except Exception as e:
os.remove(os.path.join(abspath, 'msedgedriver', "msedgedriver.exe"))
downloaddriver()
driver, username = login_loadsavecookie()
# #crawl think links
if crawl_think:
crawl_think_links(driver, username)
logfp.write(nowtime() + ', pins crawled\n')
# #crawl articles links
if crawl_article:
if not os.path.exists(os.path.join(articledir, 'article.txt')):
crawl_article_links(driver, username)
logfp.write(nowtime() + ', article weblinks crawled\n')
else:
if crawl_links_scratch:
os.rename(os.path.join(articledir, 'article.txt'), os.path.join(articledir, 'article_%s.txt'%nowtime()))
crawl_article_links(driver, username)
logfp.write(nowtime() + ', article weblinks crawled\n')
else:
pass
crawl_article_detail(driver)
logfp.write(nowtime() + ', articles crawled\n')
# #crawl answers links
if crawl_answer:
if not os.path.exists(os.path.join(answerdir, 'answers.txt')):
crawl_answers_links(driver, username)
logfp.write(nowtime() + ', answer weblinks crawled\n')
else:
if crawl_links_scratch:
os.rename(os.path.join(answerdir, 'answers.txt'), os.path.join(answerdir, 'answers_%s.txt'%nowtime()))
crawl_answers_links(driver, username)
logfp.write(nowtime() + ', answer weblinks crawled\n')
else:
pass
crawl_answer_detail(driver)
logfp.write(nowtime() + ', answers crawled\n')
driver.quit()
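# Entry point: the working directories (cookie/, think/, answer/, article/,
# log/) are created next to this script and a timestamped log file is opened
# before the command line flags are parsed.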
if __name__ == "__main__":
# version 4.10.0
driverpath = os.path.join(abspath, 'msedgedriver', 'msedgedriver.exe')
savepath = deepcopy(abspath)
cookiedir = os.path.join(savepath, 'cookie')
thinkdir = os.path.join(savepath, 'think')
answerdir = os.path.join(savepath, 'answer')
articledir = os.path.join(savepath, 'article')
logdir = os.path.join(savepath, 'log')
logfile = os.path.join(logdir, nowtime() + '_log.txt')
os.makedirs(cookiedir, exist_ok=True)
os.makedirs(thinkdir, exist_ok=True)
os.makedirs(answerdir, exist_ok=True)
os.makedirs(articledir, exist_ok=True)
os.makedirs(logdir, exist_ok=True)
logfp = open(logfile, 'w', encoding='utf-8')
cookie_path =os.path.join(cookiedir, 'cookie_zhihu.pkl')
parser = argparse.ArgumentParser(description=r'zhihu.com crawler: scrape Zhihu pins (thoughts), answers and articles, including math formulas')
parser.add_argument('--sleep_time', type=float, default = 6, \
help=r'sleep time between requests, to avoid putting too much load on the Zhihu servers; \
debug during the day and run the full crawl late at night when fewer people are online, default: 6s')
parser.add_argument('--computer_time_sleep', type=float, default=0, \
help=r'extra sleep time to compensate for a slow machine, default: 0')
parser.add_argument('--think', action="store_true", help=r'crawl pins (thoughts); pins that were already crawled are skipped, so an interrupted run can simply be restarted')
parser.add_argument('--answer', action="store_true", help=r'crawl answers and save them as pdf, markdown and images; already crawled answers are skipped, \
and after an interruption the saved link list can be reused (see --links_scratch)')
parser.add_argument('--article', action="store_true", help=r'crawl articles and save them as pdf, markdown and images; already crawled articles are skipped, \
and after an interruption the saved link list can be reused (see --links_scratch)')
parser.add_argument('--MarkDown', action="store_true", help=r'save MarkDown')
parser.add_argument('--links_scratch', action="store_true", \
help=r'for answers/articles, re-crawl the link lists (website and title) from scratch instead of reusing the previously saved ones')
args = parser.parse_args()
sleeptime = args.sleep_time
crawl_think = args.think
crawl_answer = args.answer
crawl_article = args.article
crawl_links_scratch = args.links_scratch
addtime = args.computer_time_sleep
MarkDown_FORMAT = args.MarkDown
# crawl_think = False
# crawl_article = True
# crawl_answer = True
# crawl_links_scratch = False
# MarkDown_FORMAT = True
# python.exe c:/Users/10696/Desktop/access/zhihu/crawler.py --think --MarkDown
# python.exe c:/Users/10696/Desktop/access/zhihu/crawler.py --article --MarkDown
# python.exe c:/Users/10696/Desktop/access/zhihu/crawler.py --answer --MarkDown
# python.exe c:/Users/10696/Desktop/access/zhihu/crawler.py --think --answer --article --MarkDown
zhihu()
# try:
# crawl_links_scratch = False
# zhihu()
# except:
# time.sleep(600)
# try:
# zhihu()
# except:
# time.sleep(600)
# zhihu()
logfp.close()