Compare commits

10 commits, comparing `2546508a01...ee0478010d`:

| Author | SHA1 |
|---|---|
| ZouJiu | ee0478010d |
| ZouJiu | 7d0e35d4f0 |
| ZouJiu | b9c496567f |
| ZouJiu | 7d40cf6c41 |
| ZouJiu | 4cf647fe6a |
| ZouJiu | 17055b25e9 |
| 九是否随意的称呼 | ef9f67d93d |
| ZouJiu | f7f59b53d9 |
| ZouJiu | b89f9a70e3 |
| ZouJiu | 469efcc0ce |
```diff
@@ -106,6 +106,7 @@ python.exe crawler.py --think --article --answer --MarkDown
 ### Notes
 1. A reasonably fast connection is needed; this machine tested at 100 Mbps download and 60 Mbps upload, though slower also works as long as it is not too slow or laggy [https://www.speedtest.cn/](https://www.speedtest.cn/)<br>
 2. A sleep interval is set during crawling to avoid putting too much pressure on Zhihu's servers; debug during the day, then run the crawl late at night when few people are online, giving other users a better experience and keeping Zhihu from coming down the network cable after you. Defaults to **6** s<br>
+3. If the crawler stays stuck on the login page, the previously saved cookie has probably expired; log in again and save a new cookie
 
 ### blogs
 [https://www.aliyundrive.com/s/NikyVRJq8JV shared on Aliyun Drive](https://www.aliyundrive.com/s/NikyVRJq8JV) `extraction code 0h3l` <br>
```
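Note 2's politeness delay is the pacing mechanism for the whole crawl. A minimal sketch of what such a delay looks like around a Selenium page load; `polite_get` and the `SLEEP_SECONDS` constant are hypothetical names (the 6 s default is from the README, the jitter is an added assumption):

```python
import random
import time

SLEEP_SECONDS = 6  # README default; assumed module-level constant, not from crawler.py

def polite_get(driver, url, sleep_seconds=SLEEP_SECONDS):
    """Load a page, then pause so requests stay spaced out (hypothetical helper)."""
    driver.get(url)
    # A little jitter keeps the request pattern less mechanical.
    time.sleep(sleep_seconds + random.uniform(0, 1))
```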
**crawler.py** (58 changed lines)
```diff
@@ -345,10 +345,16 @@ def parser_beautiful(innerHTML, article, number, dircrea, bk=False):
         if 'class' in chi.attrs.keys():
             classc = chi.attrs["class"]
             if datatex and classc and 'ztext-math' in classc:
-                if article[-3-1:]=='<br>' or article[-1:]=='\n':
-                    article += "\n$" + chi.attrs["data-tex"] + "$"
-                else:
-                    article += "$" + chi.attrs["data-tex"] + "$"
+                content = chi.attrs["data-tex"]
+                while len(content) > 0 and ' '==content[0]:
+                    content = content[1:]
+                while len(content) > 0 and ' '==content[-1]:
+                    content = content[:-1]
+                if len(content) > 0:
+                    if article[-3-1:]=='<br>' or article[-1:]=='\n':
+                        article += "\n$" + content + "$"
+                    else:
+                        article += "$" + content + "$"
             else:
                 article, number = parser_beautiful(chi, article, number, dircrea, bk)
     # article += nod.text
```
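The added lines trim leading and trailing spaces from the `data-tex` attribute before emitting it as `$...$` LaTeX, and skip empty formulas entirely. The two while loops are a hand-rolled `str.strip(' ')`; a behaviorally equivalent sketch, with a helper name that is hypothetical and not from crawler.py:

```python
def append_math(article: str, data_tex: str) -> str:
    """Append a data-tex formula as $...$ LaTeX, skipping blank content."""
    content = data_tex.strip(' ')  # same effect as the two while loops above
    if not content:
        return article
    # Start on a new line when the article already ends with a break.
    if article[-4:] == '<br>' or article[-1:] == '\n':
        return article + "\n$" + content + "$"
    return article + "$" + content + "$"
```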
````diff
@@ -404,6 +410,9 @@ def parser_beautiful(innerHTML, article, number, dircrea, bk=False):
     if len(prenode) > 0:
         for i in prenode:
             article += "\n\n```\n" + i.text + "\n```\n\n"
+    else:
+        article, number = parser_beautiful(chi, article, number, dircrea, bk)
+        article += "\n\n"
     if bk:
         article += "**"
     return article, number
````
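For context, the `<pre>` branch above converts each preformatted HTML block into a fenced markdown code block, and the new `else` branch recurses when a node has no `<pre>` children. A tiny standalone illustration of the `<pre>` path with BeautifulSoup; the sample HTML is made up:

````python
from bs4 import BeautifulSoup

html = "<div><pre>print('hi')</pre><p>prose</p></div>"  # made-up sample input
soup = BeautifulSoup(html, 'html.parser')

article = ""
prenode = soup.find_all('pre')
for i in prenode:
    # Wrap each <pre> block in a fenced markdown code block,
    # exactly as the hunk above does.
    article += "\n\n```\n" + i.text + "\n```\n\n"
print(article)
````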
```diff
@@ -916,7 +925,14 @@ def login_loadsavecookie():
     try:
         load_cookie(driver, cookie_path)
         driver.get(r"https://www.zhihu.com/")
-    except:
+        WebDriverWait(driver, timeout=10).until(lambda d: d.find_element(By.ID, 'Popover15-toggle'))
+        toggle = driver.find_element(By.ID, 'Popover15-toggle')
+    except Exception as e:
+        if os.path.exists(cookie_path):
+            os.remove(cookie_path)
+            print("The browser cookie has expired; the old cookie was deleted, so you need to log in again and save a new cookie.")
+        else:
+            print("You need to log in and save the cookie; after that, logging in again is unnecessary.")
         driver = login(driver)
         save_cookie(driver, cookie_path)
         driver.quit()
```
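`load_cookie` and `save_cookie` are called here but not shown in this diff. A minimal sketch of what such helpers typically look like with Selenium's `get_cookies`/`add_cookie` API; the pickle file format is an assumption, not taken from crawler.py:

```python
import pickle

def save_cookie(driver, path):
    # Persist the session cookies so the next run can skip the login flow.
    with open(path, 'wb') as f:
        pickle.dump(driver.get_cookies(), f)

def load_cookie(driver, path):
    # Cookies can only be attached to the domain currently loaded,
    # so open the site first, then replay the saved cookies.
    driver.get(r"https://www.zhihu.com/")
    with open(path, 'rb') as f:
        for cookie in pickle.load(f):
            driver.add_cookie(cookie)
```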
```diff
@@ -934,10 +950,24 @@ def login_loadsavecookie():
     username = url.split("/")[-1]
     return driver, username
 
-def zhihu():
-    # #crawl articles links
+def downloaddriver():
+    url = "https://msedgedriver.azureedge.net/116.0.1938.62/edgedriver_win64.zip"
     if not os.path.exists(driverpath):
-        response = requests.get("https://msedgedriver.azureedge.net/114.0.1823.67/edgedriver_win64.zip")
+        ret = requests.get("https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/")
+        if ret.status_code!=200:
+            assert ret.status_code==200
+        ret = BeautifulSoup(ret.content, 'html.parser')
+        # divall = ret.find_all('div', class_=r'common-card--lightblue')
+        ddl = ret.find_all('a')
+        for k in ddl:
+            key = k.attrs.keys()
+            if 'href' not in key:
+                continue
+            href = k.attrs['href']
+            if 'href' in key and "win64" in href and ".zip" in href:
+                url = href
+                break
+        response = requests.get(url)
         if response.status_code==200:
             with open(os.path.join(abspath, 'msedgedriver/edgedriver.zip'), 'wb') as obj:
                 obj.write(response.content)
```
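The hunk ends after writing `edgedriver.zip`; the extraction step falls outside this diff. A minimal sketch of that step with the standard-library `zipfile` module, reusing the paths above (the helper name is hypothetical):

```python
import os
import zipfile

def unzip_driver(abspath):
    # Extract msedgedriver.exe next to the downloaded archive (hypothetical helper).
    zip_path = os.path.join(abspath, 'msedgedriver/edgedriver.zip')
    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(os.path.join(abspath, 'msedgedriver'))
```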
```diff
@@ -958,7 +988,15 @@ def zhihu():
         if kk < 0:
             break
 
-    driver, username = login_loadsavecookie()
+def zhihu():
+    # #crawl articles links
+    try:
+        downloaddriver()
+        driver, username = login_loadsavecookie()
+    except Exception as e:
+        os.remove(os.path.join(abspath, 'msedgedriver', "msedgedriver.exe"))
+        downloaddriver()
+        driver, username = login_loadsavecookie()
 
     # #crawl think links
     if crawl_think:
```
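The try/except above retries the driver download and login exactly once after deleting a possibly stale `msedgedriver.exe`. The same control flow as a generic helper; a sketch with hypothetical names, not from crawler.py:

```python
def retry_once(action, cleanup):
    """Run action(); on any failure, run cleanup() and try exactly once more."""
    try:
        return action()
    except Exception:
        cleanup()
        return action()
```

Here `action` would wrap `downloaddriver()` plus `login_loadsavecookie()`, and `cleanup` would delete the cached driver binary.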
```diff
@@ -1058,4 +1096,4 @@ if __name__ == "__main__":
     # except:
     # time.sleep(600)
     # zhihu()
     logfp.close()
```