add cookie login

master
ds19991999 2019-11-08 11:57:43 +08:00
parent 78aafe5689
commit 53d81fd8e9
5 changed files with 54 additions and 18 deletions

View File

@ -3,17 +3,27 @@
> 主要功能:爬取 csdn 博客指定用户的所有博文并转换为 markdown 格式保存到本地。
## 下载脚本
```
```shell
git clone https://github.com/ds19991999/csdn-spider.git
cd csdn-spider
python3 -m pip install -r requirements.txt
# 测试
python3 test.py # 需要先配置登录 cookie
```
## 获取 cookie
登录 `csdn` 账号后进入 https://blog.csdn.net ,按 `F12` 打开开发者工具调试网页,复制该页面请求的所有 `Request Headers`,并按每行一条 `Name: value` 的格式保存到 `cookie.txt` 文件中。
![1571482112632](assets/1571482112632.png)
## 爬取用户全部博文
```python
import csdn
csdn.spider(["ds19991999", "u013088062"], 5)
# 参数 usernames: list, thread_num: int = 10, folder_name: str = "articles"
csdn.spider(["ds19991999", "u013088062"], "cookie.txt",5)
# 参数 usernames: list, cookie_path:str, thread_num: int = 10, folder_name: str = "articles"
```
## LICENSE

BIN
assets/1571482112632.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 397 KiB

14
cookie.txt Normal file
View File

@ -0,0 +1,14 @@
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
Accept-Encoding: gzip, deflate, br
Accept-Language: zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,ar-XB;q=0.6,ar;q=0.5,und;q=0.4,fr;q=0.3,zh-TW;q=0.2,co;q=0.1
Cache-Control: max-age=0
Connection: keep-alive
Cookie: uuid_tt_dd=10_19036043710-1566202960198-929717; dc_session_id=10_1566202960198.130586; smidV2=20190822144441a32b24b82db3e637ea0d909effad9e97008cdaa3064a9c6c0; UN=username; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=6525*1*10_19036043710-1566202960198-929717!1788*1*PC_VC!5744*1*username; __yadk_uid=uid; Hm_ct_e5ef47b9f471504959267fd614d579cd=5744*1*username!6525*1*10_19036043710-1566202960198-929717; Hm_lvt_05d2e527c7434da7fe2a2083f260121f=1570706886; Hm_ct_05d2e527c7434da7fe2a2083f260121f=5744*1*username!6525*1*10_19036043710-1566202960198-929717; Hm_ct_4a20bfe8e339184241f52b1b2c53e116=5744*1*username!6525*1*10_19036043710-1566202960198-929717; Hm_lvt_7baff5bb3ade13044599f3b4bf3d5adf=1570848963; Hm_ct_7baff5bb3ade13044599f3b4bf3d5adf=5744*1*username!6525*1*10_19036043710-1566202960198-929717; Hm_lvt_e5ef47b9f471504959267fd614d579cd=1570700925,1570852128; Hm_lvt_4a20bfe8e339184241f52b1b2c53e116=1570843980,1570873265; acw_tc=2760828a15731191947875934e10044c2614f0ad2a40881b50e19c2b892610; hasSub=true; c_adb=1; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1571484485,1573119199,1573178155,1573183521; acw_sc__v3=5dc4e0221baaef4e53f90094f75d2db95c9ba704; acw_sc__v2=5dc4e21e400b81dedb4a973df7d31ded93647980; TY_SESSION_ID=796586fa-bc9f-4612-9700-a0f28a98408b; SESSION=443bcab2-fde2-427b-b1dd-f28cdb457e08; UserName=username; UserInfo=yourtoken; UserToken=yourtoken; UserNick=username; AU=2CA; BT=1573184042504; p_uid=U000000; announcement=%257B%2522isLogin%2522%253Atrue%252C%2522announcementUrl%2522%253A%2522https%253A%252F%252Fblogdev.blog.csdn.net%252Farticle%252Fdetails%252F102605809%2522%252C%2522announcementCount%2522%253A0%252C%2522announcementExpire%2522%253A3600000%257D; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1573184082; dc_tos=q0mslu
DNT: 1
Host: blog.csdn.net
Referer: https://passport.csdn.net/login?code=public
Sec-Fetch-Mode: navigate
Sec-Fetch-Site: same-site
Sec-Fetch-User: ?1
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36

View File

@ -14,7 +14,10 @@ from .tomd import Tomd
def result_file(folder_username, file_name, folder_name):
folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", folder_name, folder_username)
if not os.path.exists(folder):
os.makedirs(folder)
try:
os.makedirs(folder)
except Exception:
pass
path = os.path.join(folder, file_name)
file = open(path,"w")
file.close()
@ -23,6 +26,16 @@ def result_file(folder_username, file_name, folder_name):
return path
def get_headers(cookie_path: str) -> dict:
    """Load HTTP request headers from a text file of ``Name: value`` lines.

    The file is expected to hold one header per line, e.g. the
    "Request Headers" copied from a browser's developer tools.

    Args:
        cookie_path: Path to the UTF-8 text file containing the headers.

    Returns:
        A dict mapping each header name to its value, both stripped of
        surrounding whitespace. If a name occurs more than once, the last
        occurrence wins.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    headers = {}
    with open(cookie_path, "r", encoding="utf-8") as f:
        for line in f:
            # Split on the FIRST colon only: values such as
            # "Referer: https://..." contain colons themselves and must
            # not be truncated.
            name, sep, value = line.partition(":")
            name = name.strip()
            if not sep or not name:
                # Blank lines, lines without a colon, and lines that start
                # with ":" have no usable header name; skip them instead
                # of raising or storing a bogus entry.
                continue
            headers[name] = value.strip()
    return headers
def delete_ele(soup:BeautifulSoup, tags:list):
for ele in tags:
for useless_tag in soup.select(ele):
@ -80,10 +93,12 @@ class TaskQueue(object):
class CSDN(object):
def __init__(self, username, folder_name):
self.headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36"
}
def __init__(self, username, folder_name, cookie_path):
# self.headers = {
# "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36"
# }
self.headers = get_headers(cookie_path)
self.s = requests.Session()
self.username = username
self.TaskQueue = TaskQueue()
self.folder_name = folder_name
@ -95,7 +110,7 @@ class CSDN(object):
while len(articles) > 0:
num += 1
url = u'https://blog.csdn.net/' + self.username + '/article/list/' + str(num)
response = requests.get(url=url, headers=self.headers)
response = self.s.get(url=url, headers=self.headers)
html = response.text
soup = BeautifulSoup(html, "html.parser")
articles = soup.find_all('div', attrs={"class":"article-item-box csdn-tracking-statistics"})
@ -106,7 +121,7 @@ class CSDN(object):
self.TaskQueue.InsertUnVisitedList([article_title, article_href])
def get_md(self, url):
response = requests.get(url=url, headers=self.headers)
response = self.s.get(url=url, headers=self.headers)
html = response.text
soup = BeautifulSoup(html, 'lxml')
content = soup.select_one("#content_views")
@ -185,10 +200,10 @@ def ensure_memory(size):
total_mem += size
def spider_user(username: str, thread_num: int = 10, folder_name: str = "articles"):
def spider_user(username: str, cookie_path:str, thread_num: int = 10, folder_name: str = "articles"):
if not os.path.exists(folder_name):
os.makedirs(folder_name)
csdn = CSDN(username,folder_name)
csdn = CSDN(username, folder_name, cookie_path)
csdn.start()
th1 = threading.Thread(target=csdn.write_readme)
th1.start()
@ -196,15 +211,12 @@ def spider_user(username: str, thread_num: int = 10, folder_name: str = "article
th2.start()
def spider(usernames: list, cookie_path: str, thread_num: int = 10, folder_name: str = "articles"):
    """Start one crawler thread per username.

    Each thread runs ``spider_user`` with the given username and the
    shared ``cookie_path``, ``thread_num`` and ``folder_name`` arguments.
    The threads are started but not joined, so this function returns
    without waiting for them.

    Args:
        usernames: Usernames to crawl; one thread is started for each.
        cookie_path: Path to the request-headers/cookie file, forwarded to
            ``spider_user``.
        thread_num: Forwarded unchanged to ``spider_user``.
        folder_name: Forwarded unchanged to ``spider_user``.
    """
    for username in usernames:
        try:
            # cookie_path must come second to match spider_user's
            # positional parameter order.
            user_thread = threading.Thread(
                target=spider_user,
                args=(username, cookie_path, thread_num, folder_name),
            )
            user_thread.start()
            print("[++] 开启爬取 {} 博文进程成功 ......".format(username))
        except Exception:
            # Only failures while creating/starting the thread land here;
            # errors raised inside the thread itself are not seen by this
            # handler.
            print("[--] 开启爬取 {} 博文进程出现异常 ......".format(username))
if __name__ == "__main__":
spider(["ds19991999"])

View File

@ -4,4 +4,4 @@
import csdn
if __name__ == "__main__":
csdn.spider(["ds19991999", "u013088062"], 5)
csdn.spider(["ds19991999", "u013088062"], "cookie.txt",5)