Compare commits

...

10 Commits

Author SHA1 Message Date
ZouJiu ee0478010d findelement 2024-02-04 18:10:44 +08:00
ZouJiu 7d0e35d4f0 crawler 2024-02-04 17:07:30 +08:00
ZouJiu b9c496567f cookie not available, need login again 2023-11-22 20:55:47 +08:00
ZouJiu 7d40cf6c41 auto-download edgedriver window 2023-11-11 15:50:01 +08:00
ZouJiu 4cf647fe6a Merge branch 'master' of https://github.com/ZouJiu1/zhihu_spider_selenium 2023-10-21 21:46:06 +08:00
ZouJiu 17055b25e9 formula remove space 2023-10-21 21:45:55 +08:00
九是否随意的称呼 ef9f67d93d Update crawler.py 2023-10-20 21:25:03 +08:00
ZouJiu f7f59b53d9 div parser website 2023-10-20 21:17:19 +08:00
ZouJiu b89f9a70e3 div content crawler 2023-10-20 20:51:29 +08:00
ZouJiu 469efcc0ce driver download lastest 2023-08-29 16:11:05 +08:00
2 changed files with 49 additions and 10 deletions

View File

@@ -106,6 +106,7 @@ python.exe crawler.py --think --article --answer --MarkDown
 ### Note
 1. You need a reasonably fast connection; this machine tested at 100 Mbps download / 60 Mbps upload, and somewhat slower is also fine as long as it is not too slow or laggy ([https://www.speedtest.cn/](https://www.speedtest.cn/))<br>
 2. A sleep time is set during crawling to avoid putting too much pressure on Zhihu's servers; you can debug during the day and run the crawl late at night when fewer people are online, which gives other users a better experience and keeps Zhihu from coming down the wire after you; the default is **6** s<br>
+3. If the crawler stays stuck on the login page, the previously saved cookie has probably expired and you need to log in again to save a fresh cookie
 ### blogs
 [https://www.aliyundrive.com/s/NikyVRJq8JV shared on Aliyun Drive](https://www.aliyundrive.com/s/NikyVRJq8JV) `extraction code 0h3l` <br>

View File

@@ -345,10 +345,16 @@ def parser_beautiful(innerHTML, article, number, dircrea, bk=False):
                 if 'class' in chi.attrs.keys():
                     classc = chi.attrs["class"]
                 if datatex and classc and 'ztext-math' in classc:
-                    if article[-3-1:]=='<br>' or article[-1:]=='\n':
-                        article += "\n$" + chi.attrs["data-tex"] + "$"
-                    else:
-                        article += "$" + chi.attrs["data-tex"] + "$"
+                    content = chi.attrs["data-tex"]
+                    while len(content) > 0 and ' '==content[0]:
+                        content = content[1:]
+                    while len(content) > 0 and ' '==content[-1]:
+                        content = content[:-1]
+                    if len(content) > 0:
+                        if article[-3-1:]=='<br>' or article[-1:]=='\n':
+                            article += "\n$" + content + "$"
+                        else:
+                            article += "$" + content + "$"
                 else:
                     article, number = parser_beautiful(chi, article, number, dircrea, bk)
                     # article += nod.text
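The rewritten branch above trims literal spaces from both ends of the `data-tex` attribute before wrapping it in `$…$`, and now skips empty formulas entirely. The two character-by-character `while` loops behave like Python's `str.strip(' ')`; a minimal standalone sketch of the same step (the helper name `wrap_formula` is invented for illustration):

```python
# Sketch of the formula-appending logic above; strip(' ') removes the same
# leading/trailing spaces as the two while-loops in the diff.
def wrap_formula(article: str, data_tex: str) -> str:
    content = data_tex.strip(' ')
    if len(content) == 0:          # empty formula: append nothing
        return article
    if article[-4:] == '<br>' or article[-1:] == '\n':   # -3-1 == -4 in the diff
        return article + "\n$" + content + "$"
    return article + "$" + content + "$"

print(wrap_formula("line<br>", "  E = mc^2  "))  # -> line<br>\n$E = mc^2$
```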
@@ -404,6 +410,9 @@ def parser_beautiful(innerHTML, article, number, dircrea, bk=False):
             if len(prenode) > 0:
                 for i in prenode:
                     article += "\n\n```\n" + i.text + "\n```\n\n"
+            else:
+                article, number = parser_beautiful(chi, article, number, dircrea, bk)
+                article += "\n\n"
     if bk:
         article += "**"
     return article, number
@@ -916,7 +925,14 @@ def login_loadsavecookie():
     try:
         load_cookie(driver, cookie_path)
         driver.get(r"https://www.zhihu.com/")
-    except:
+        WebDriverWait(driver, timeout=10).until(lambda d: d.find_element(By.ID, 'Popover15-toggle'))
+        toggle = driver.find_element(By.ID, 'Popover15-toggle')
+    except Exception as e:
+        if os.path.exists(cookie_path):
+            os.remove(cookie_path)
+            print("The browser cookie has expired; the old cookie was deleted, so you need to log in again and save a new one.")
+        else:
+            print("You need to log in and save the cookie; after that you will not need to log in again.")
         driver = login(driver)
         save_cookie(driver, cookie_path)
         driver.quit()
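The new `try` body waits up to 10 s for the logged-in avatar toggle (`Popover15-toggle`) to appear; with a stale cookie the wait times out, the `except` branch deletes the cookie file, and a fresh login is forced. The `save_cookie`/`load_cookie` helpers themselves are not part of this hunk; a plausible `pickle`-based implementation (an assumption, not the project's actual code) looks like:

```python
import os
import pickle

# Hypothetical versions of the helpers the hunk calls; the real
# save_cookie/load_cookie in crawler.py may differ in detail.
def save_cookie(driver, cookie_path):
    with open(cookie_path, 'wb') as fp:
        pickle.dump(driver.get_cookies(), fp)  # Selenium returns a list of dicts

def load_cookie(driver, cookie_path):
    if not os.path.exists(cookie_path):
        raise FileNotFoundError(cookie_path)
    driver.get("https://www.zhihu.com/")       # must be on the domain before add_cookie
    with open(cookie_path, 'rb') as fp:
        for cookie in pickle.load(fp):
            driver.add_cookie(cookie)
```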
@@ -934,10 +950,24 @@ def login_loadsavecookie():
         username = url.split("/")[-1]
     return driver, username
 
-def zhihu():
-    # #crawl articles links
+def downloaddriver():
+    url = "https://msedgedriver.azureedge.net/116.0.1938.62/edgedriver_win64.zip"
     if not os.path.exists(driverpath):
-        response = requests.get("https://msedgedriver.azureedge.net/114.0.1823.67/edgedriver_win64.zip")
+        ret = requests.get("https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/")
+        if ret.status_code!=200:
+            assert ret.status_code==200  # abort when the listing page cannot be fetched
+        ret = BeautifulSoup(ret.content, 'html.parser')
+        # divall = ret.find_all('div', class_=r'common-card--lightblue')
+        ddl = ret.find_all('a')
+        for k in ddl:
+            key = k.attrs.keys()
+            if 'href' not in key:
+                continue
+            href = k.attrs['href']
+            if 'href' in key and "win64" in href and ".zip" in href:
+                url = href
+                break
+        response = requests.get(url)
         if response.status_code==200:
             with open(os.path.join(abspath, 'msedgedriver/edgedriver.zip'), 'wb') as obj:
                 obj.write(response.content)
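Instead of hard-coding one driver build (114.0.1823.67 before this change), the new code scrapes Microsoft's Edge WebDriver page for the first `win64` `.zip` link and falls back to a pinned 116 build. A condensed sketch of that discovery step (same URLs as the diff; `find_edgedriver_url` is an invented name):

```python
import requests
from bs4 import BeautifulSoup

# Pinned fallback from the diff, used when no win64 zip link is found.
FALLBACK = "https://msedgedriver.azureedge.net/116.0.1938.62/edgedriver_win64.zip"

def find_edgedriver_url():
    ret = requests.get("https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/")
    ret.raise_for_status()                   # clearer failure than the diff's assert
    soup = BeautifulSoup(ret.content, 'html.parser')
    for a in soup.find_all('a', href=True):  # href=True skips anchors without href
        href = a['href']
        if "win64" in href and ".zip" in href:
            return href
    return FALLBACK
```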
@@ -958,7 +988,15 @@ def zhihu():
             if kk < 0:
                 break
 
-    driver, username = login_loadsavecookie()
+def zhihu():
+    # #crawl articles links
+    try:
+        downloaddriver()
+        driver, username = login_loadsavecookie()
+    except Exception as e:
+        os.remove(os.path.join(abspath, 'msedgedriver', "msedgedriver.exe"))
+        downloaddriver()
+        driver, username = login_loadsavecookie()
 
     # #crawl think links
     if crawl_think:
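The relocated `zhihu()` entry point now retries once: if driver download or login raises (typically because the cached `msedgedriver.exe` no longer matches the installed Edge), the binary is removed and both steps run again. The same pattern in isolation, as a generic sketch (`with_driver_retry` and `setup` are invented names; note the diff removes the binary unconditionally, while the sketch checks for it first):

```python
import os

# Generic sketch of the one-retry pattern used by zhihu() above: on any
# failure, assume the cached driver binary is stale, delete it, retry once.
def with_driver_retry(setup, driver_exe):
    try:
        return setup()
    except Exception:
        if os.path.exists(driver_exe):  # the diff calls os.remove unconditionally
            os.remove(driver_exe)
        return setup()
```

Here `setup` would bundle the equivalents of `downloaddriver()` and `login_loadsavecookie()`.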
@@ -1058,4 +1096,4 @@ if __name__ == "__main__":
     # except:
     #     time.sleep(600)
     # zhihu()
     logfp.close()