add cookie login

parent 78aafe5689
commit 53d81fd8e9

16  README.md

@@ -3,17 +3,27 @@
 > Main feature: crawl all blog posts of a specified CSDN user and save them locally in markdown format.

 ## Download the script
-```
+```shell
 git clone https://github.com/ds19991999/csdn-spider.git
 cd csdn-spider
 python3 -m pip install -r requirements.txt
+
+# Test
+python3 test.py # the login cookie must be configured first
 ```
+
+## Get the cookie
+
+Log in to your `csdn` account, open https://blog.csdn.net, press `F12` to open the browser's developer tools, copy all of the `Request Headers`, and save them to the `cookie.txt` file.
+
+![1571482112632](assets/1571482112632.png)

 ## Crawl all of a user's posts

 ```python
 import csdn
-csdn.spider(["ds19991999", "u013088062"], 5)
-# parameters: usernames: list, thread_num: int = 10, folder_name: str = "articles"
+csdn.spider(["ds19991999", "u013088062"], "cookie.txt",5)
+# parameters: usernames: list, cookie_path:str, thread_num: int = 10, folder_name: str = "articles"
 ```

 ## LICENSE
assets/1571482112632.png: binary file not shown (new image, 397 KiB)
cookie.txt (new file)

@@ -0,0 +1,14 @@
+Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
+Accept-Encoding: gzip, deflate, br
+Accept-Language: zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,ar-XB;q=0.6,ar;q=0.5,und;q=0.4,fr;q=0.3,zh-TW;q=0.2,co;q=0.1
+Cache-Control: max-age=0
+Connection: keep-alive
+Cookie: uuid_tt_dd=10_19036043710-1566202960198-929717; dc_session_id=10_1566202960198.130586; smidV2=20190822144441a32b24b82db3e637ea0d909effad9e97008cdaa3064a9c6c0; UN=username; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=6525*1*10_19036043710-1566202960198-929717!1788*1*PC_VC!5744*1*username; __yadk_uid=uid; Hm_ct_e5ef47b9f471504959267fd614d579cd=5744*1*username!6525*1*10_19036043710-1566202960198-929717; Hm_lvt_05d2e527c7434da7fe2a2083f260121f=1570706886; Hm_ct_05d2e527c7434da7fe2a2083f260121f=5744*1*username!6525*1*10_19036043710-1566202960198-929717; Hm_ct_4a20bfe8e339184241f52b1b2c53e116=5744*1*username!6525*1*10_19036043710-1566202960198-929717; Hm_lvt_7baff5bb3ade13044599f3b4bf3d5adf=1570848963; Hm_ct_7baff5bb3ade13044599f3b4bf3d5adf=5744*1*username!6525*1*10_19036043710-1566202960198-929717; Hm_lvt_e5ef47b9f471504959267fd614d579cd=1570700925,1570852128; Hm_lvt_4a20bfe8e339184241f52b1b2c53e116=1570843980,1570873265; acw_tc=2760828a15731191947875934e10044c2614f0ad2a40881b50e19c2b892610; hasSub=true; c_adb=1; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1571484485,1573119199,1573178155,1573183521; acw_sc__v3=5dc4e0221baaef4e53f90094f75d2db95c9ba704; acw_sc__v2=5dc4e21e400b81dedb4a973df7d31ded93647980; TY_SESSION_ID=796586fa-bc9f-4612-9700-a0f28a98408b; SESSION=443bcab2-fde2-427b-b1dd-f28cdb457e08; UserName=username; UserInfo=yourtoken; UserToken=yourtoken; UserNick=username; AU=2CA; BT=1573184042504; p_uid=U000000; announcement=%257B%2522isLogin%2522%253Atrue%252C%2522announcementUrl%2522%253A%2522https%253A%252F%252Fblogdev.blog.csdn.net%252Farticle%252Fdetails%252F102605809%2522%252C%2522announcementCount%2522%253A0%252C%2522announcementExpire%2522%253A3600000%257D; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1573184082; dc_tos=q0mslu
+DNT: 1
+Host: blog.csdn.net
+Referer: https://passport.csdn.net/login?code=public
+Sec-Fetch-Mode: navigate
+Sec-Fetch-Site: same-site
+Sec-Fetch-User: ?1
+Upgrade-Insecure-Requests: 1
+User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36
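
As a side note, a minimal sketch of how a header dump in the format above can be loaded and reused with `requests` (the `load_headers` name and the example URL are illustrative, not part of this commit; `split(":", 1)` is used so values that themselves contain a colon, such as the `Referer` URL, stay intact):

```python
import requests

def load_headers(path: str) -> dict:
    """Parse "Name: value" lines from a saved Request Headers dump."""
    headers = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if ":" not in line:
                continue  # skip blank or malformed lines
            name, value = line.split(":", 1)  # maxsplit=1 keeps URLs in values intact
            headers[name.strip()] = value.strip()
    return headers

if __name__ == "__main__":
    headers = load_headers("cookie.txt")
    resp = requests.get("https://blog.csdn.net/ds19991999", headers=headers)
    print(resp.status_code)
```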
40  csdn/csdn.py

@@ -14,7 +14,10 @@ from .tomd import Tomd
 def result_file(folder_username, file_name, folder_name):
     folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", folder_name, folder_username)
     if not os.path.exists(folder):
-        os.makedirs(folder)
+        try:
+            os.makedirs(folder)
+        except Exception:
+            pass
         path = os.path.join(folder, file_name)
         file = open(path,"w")
         file.close()
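
The try/except around `os.makedirs` presumably guards against two crawl threads creating the same folder at the same time; a narrower variant (a sketch of an alternative, with a hypothetical `ensure_folder` helper, not what the commit does) keeps that behaviour without swallowing unrelated errors:

```python
import os

def ensure_folder(folder: str) -> None:
    # exist_ok=True makes the call a no-op when the folder already exists,
    # so concurrent threads cannot race, while other OSErrors still surface.
    os.makedirs(folder, exist_ok=True)
```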
@@ -23,6 +26,16 @@ def result_file(folder_username, file_name, folder_name):
     return path


+def get_headers(cookie_path:str):
+    cookies = {}
+    with open(cookie_path, "r", encoding="utf-8") as f:
+        cookie_list = f.readlines()
+    for line in cookie_list:
+        cookie = line.split(":")
+        cookies[cookie[0]] = str(cookie[1]).strip()
+    return cookies
+
+
 def delete_ele(soup:BeautifulSoup, tags:list):
     for ele in tags:
         for useless_tag in soup.select(ele):
@@ -80,10 +93,12 @@ class TaskQueue(object):


 class CSDN(object):
-    def __init__(self, username, folder_name):
-        self.headers = {
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36"
-        }
+    def __init__(self, username, folder_name, cookie_path):
+        # self.headers = {
+        #     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36"
+        # }
+        self.headers = get_headers(cookie_path)
+        self.s = requests.Session()
         self.username = username
         self.TaskQueue = TaskQueue()
         self.folder_name = folder_name
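
With a `requests.Session` now held on the instance, one possible simplification (an assumption about intent, not code from this commit) is to register the headers on the session once instead of passing `headers=` to every `self.s.get` call:

```python
import requests
from csdn.csdn import get_headers  # the helper added above; import path assumes the repo layout

# Hypothetical variant: attach the saved headers (including the Cookie) to the
# session itself, so every request made through it is sent as the logged-in user.
session = requests.Session()
session.headers.update(get_headers("cookie.txt"))
resp = session.get("https://blog.csdn.net/ds19991999/article/list/1")
print(resp.status_code)
```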
@@ -95,7 +110,7 @@ class CSDN(object):
         while len(articles) > 0:
             num += 1
             url = u'https://blog.csdn.net/' + self.username + '/article/list/' + str(num)
-            response = requests.get(url=url, headers=self.headers)
+            response = self.s.get(url=url, headers=self.headers)
             html = response.text
             soup = BeautifulSoup(html, "html.parser")
             articles = soup.find_all('div', attrs={"class":"article-item-box csdn-tracking-statistics"})
@@ -106,7 +121,7 @@ class CSDN(object):
                 self.TaskQueue.InsertUnVisitedList([article_title, article_href])

     def get_md(self, url):
-        response = requests.get(url=url, headers=self.headers)
+        response = self.s.get(url=url, headers=self.headers)
         html = response.text
         soup = BeautifulSoup(html, 'lxml')
         content = soup.select_one("#content_views")
@@ -185,10 +200,10 @@ def ensure_memory(size):
         total_mem += size


-def spider_user(username: str, thread_num: int = 10, folder_name: str = "articles"):
+def spider_user(username: str, cookie_path:str, thread_num: int = 10, folder_name: str = "articles"):
     if not os.path.exists(folder_name):
         os.makedirs(folder_name)
-    csdn = CSDN(username,folder_name)
+    csdn = CSDN(username, folder_name, cookie_path)
     csdn.start()
     th1 = threading.Thread(target=csdn.write_readme)
     th1.start()
@@ -196,15 +211,12 @@ def spider_user(username: str, thread_num: int = 10, folder_name: str = "articles"):
     th2.start()


-def spider(usernames: list, thread_num: int = 10, folder_name: str = "articles"):
+def spider(usernames: list, cookie_path:str, thread_num: int = 10, folder_name: str = "articles"):
     for username in usernames:
         try:
-            user_thread = threading.Thread(target=spider_user,args=(username, thread_num, folder_name))
+            user_thread = threading.Thread(target=spider_user,args=(username, cookie_path, thread_num, folder_name))
             user_thread.start()
             print("[++] Successfully started the thread crawling {}'s posts ......".format(username))
         except Exception:
             print("[--] Hit an exception while starting the thread crawling {}'s posts ......".format(username))

-
-if __name__ == "__main__":
-    spider(["ds19991999"])
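
The `if __name__ == "__main__"` block is removed, so the module is no longer run directly; per the updated README the entry point is now `test.py`. Its contents are not shown in this commit, but based on the README it presumably boils down to something like:

```python
# test.py (hypothetical): drive the crawler with the saved login cookie.
import csdn

# Crawl these users' posts with 5 threads, authenticating with cookie.txt.
csdn.spider(["ds19991999", "u013088062"], "cookie.txt", 5)
```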