diff --git a/CSDN/__init__.py b/CSDN/__init__.py new file mode 100644 index 0000000..5d109ba --- /dev/null +++ b/CSDN/__init__.py @@ -0,0 +1,2 @@ +from .csdn import run +from .csdn import CSDN diff --git a/CSDN/csdn.py b/CSDN/csdn.py new file mode 100644 index 0000000..f0edc4b --- /dev/null +++ b/CSDN/csdn.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python +# coding: utf-8 + +import os, time, re +import requests +import threading +from bs4 import BeautifulSoup, Comment +from .tomd import Tomd + + +def result_file(folder_name, file_name, article_name): + folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../"+article_name, folder_name) + if not os.path.exists(folder): + os.makedirs(folder) + path = os.path.join(folder, file_name) + file = open(path,"w") + file.close() + else: + path = os.path.join(folder, file_name) + return path + + +def delete_ele(soup:BeautifulSoup, tags:list): + for ele in tags: + for useless_tag in soup.select(ele): + useless_tag.decompose() + + +def delete_ele_attr(soup:BeautifulSoup, attrs:list): + for attr in attrs: + for useless_attr in soup.find_all(): + del useless_attr[attr] + + +def delete_blank_ele(soup:BeautifulSoup, eles_except:list): + for useless_attr in soup.find_all(): + try: + if useless_attr.name not in eles_except and useless_attr.text == "": + useless_attr.decompose() + except Exception: + pass + + +class TaskQueue(object): + def __init__(self): + self.VisitedList = [] + self.UnVisitedList = [] + + def getVisitedList(self): + return self.VisitedList + + def getUnVisitedList(self): + return self.UnVisitedList + + def InsertVisitedList(self, url): + if url not in self.VisitedList: + self.VisitedList.append(url) + + def InsertUnVisitedList(self, url): + if url not in self.UnVisitedList: + self.UnVisitedList.append(url) + + def RemoveVisitedList(self, url): + self.VisitedList.remove(url) + + def PopUnVisitedList(self,index=0): + url = [] + if index and self.UnVisitedList: + url = self.UnVisitedList[index] + del self.UnVisitedList[:index] + elif self.UnVisitedList: + url = self.UnVisitedList.pop() + return url + + def getUnVisitedListLength(self): + return len(self.UnVisitedList) + + +class CSDN(object): + def __init__(self, username, article__folder_name): + self.headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36" + } + self.username = username + self.TaskQueue = TaskQueue() + self.article__folder_name = article__folder_name + self.url_num = 1 + + def start(self): + """获取文章标题和链接""" + num = 0 + while True: + num += 1 + url = u'https://blog.csdn.net/' + self.username + '/article/list/' + str(num) + response = requests.get(url=url, headers=self.headers) + html = response.text + soup = BeautifulSoup(html, "html.parser") + articles = soup.find_all('div', attrs={"class":"article-item-box csdn-tracking-statistics"}) + if len(articles) > 0: + for article in articles: + article_title = article.a.text.strip().replace(' ',': ') + article_href = article.a['href'] + self.TaskQueue.InsertUnVisitedList([article_title, article_href]) + else: + break + + def get_md(self, url): + """爬取文章""" + response = requests.get(url=url, headers=self.headers) + html = response.text + soup = BeautifulSoup(html, 'lxml') + content = soup.select_one("#content_views") + # 删除注释 + for useless_tag in content(text=lambda text: isinstance(text, Comment)): + useless_tag.extract() + # 删除无用标签 + tags = ["svg", "ul", ".hljs-button.signin"] + delete_ele(content, tags) + # 删除标签属性 + attrs = ["class", "name", "id", "onclick", "style", "data-token", "rel"] + delete_ele_attr(content,attrs) + # 删除空白标签 + eles_except = ["img", "br", "hr"] + delete_blank_ele(content, eles_except) + # 转换为markdown + md = Tomd(str(content)).markdown + return md + + + def write_readme(self): + """生成readme""" + print("[++] 正在爬取 {} 的博文 ......".format(self.username)) + reademe_path = result_file(self.username,file_name="README.md",article_name=self.article__folder_name) + with open(reademe_path,'w', encoding='utf-8') as reademe_file: + readme_head = "# " + self.username + " 的博文\n" + reademe_file.write(readme_head) + for [article_title,article_href] in self.TaskQueue.UnVisitedList[::-1]: + text = str(self.url_num) + '. [' + article_title + ']('+ article_href +')\n' + reademe_file.write(text) + self.url_num += 1 + self.url_num = 1 + + def spider(self): + """爬取所有文章""" + try: + while True: + [article_title,article_href] = self.TaskQueue.PopUnVisitedList() + try: + print("[++++] 正在处理URL:{}".format(article_href)) + file_name = re.sub(r'[\/::*?"<>|]','-', article_title) + ".md" + artical_path = result_file(folder_name=self.username, file_name=file_name, article_name=self.article__folder_name) + md_head = "# " + article_title + "\n" + md = md_head + self.get_md(article_href) + with open(artical_path, "w", encoding="utf-8") as artical_file: + artical_file.write(md) + except Exception: + print("[----] 处理URL异常:{}".format(article_href)) + self.url_num += 1 + except Exception: + pass + + def muti_spider(self, thread_num): + while True: + if self.TaskQueue.getUnVisitedListLength() < 1: + break + thread_list = [] + for i in range(thread_num): + th = threading.Thread(target=self.spider) + thread_list.append(th) + for th in thread_list: + th.start() + + + +def run(username: str = "ds19991999", thread_num: int = 10, article__folder_name: str = "articles"): + if not os.path.exists(article__folder_name): + os.makedirs(article__folder_name) + csdn = CSDN(username,article__folder_name) + csdn.start() + csdn.write_readme() + csdn.muti_spider(thread_num) + + +if __name__ == "__main__": + run("ds19991999", 10, "articles") + diff --git a/tomd.py b/CSDN/tomd.py similarity index 100% rename from tomd.py rename to CSDN/tomd.py diff --git a/README.md b/README.md index f942079..4ed07c3 100644 --- a/README.md +++ b/README.md @@ -1,62 +1,28 @@ -# CSDN 爬虫脚本 +# CSDN 爬虫 -主要功能:爬取 `csdn` 博客指定用户的所有博文并转换为 `markdown` 格式保存到本地。 +> 主要功能:爬取 csdn 博客指定用户的所有博文并转换为 markdown 格式保存到本地。 -## 一、运行环境 - -需要安装`WebDriver`驱动,https://chromedriver.chromium.org/downloads ,下载与本地对应的`chrome`驱动后,将其添加至环境变量`$PATH` - -```shell -python3 +## 下载脚本 +``` +git clone https://github.com/ds19991999/csdn-spider.git +cd csdn-spider python3 -m pip install -r requirements.txt ``` -## 二、获取脚本 +## 爬取用户全部博文 +```python +#!/usr/bin/env python +# coding: utf-8 -```shell -git clone https://github.com/ds19991999/csdn-spider.git +if __name__ == "__main__": + import CSDN + CSDN.run("ds19991999") ``` -## 三、用法 - -### 1.获取cookie - -登录 `csdn` 账号,进入:https://blog.csdn.net ,按 `F12` 调试网页,复制所有的 `Request Headers`,保存到`cookie.txt`文件中 - -![1571482112632](assets/1571482112632.png) - -### 2.添加需要爬取的 `csdn` 用户 - -在`username.txt`中添加用户名,一行一个 - -### 3.运行脚本 - -```shell -python3 csdn.py -``` - -## 四、效果 - -**运行过程** - -![1571483423256](assets/1571483423256.png) - -**文章列表建立**:`./articles/username/README.md` - -![1571483552438](assets/1571483552438.png) - -**爬取的博文**:`./articles/username/` - -![1571483479356](assets/1571483479356.png) - -**博文转换效果**: - -![1571483777703](assets/1571483777703.png) - -## 五、LICENSE +## LICENSE Creative Commons License -`PS`:随意写的爬虫脚本,佛系更新。 +`PS`:随意写的爬虫脚本,佛系更新。 \ No newline at end of file diff --git a/assets/1571482112632.png b/assets/1571482112632.png deleted file mode 100644 index 452bbc4..0000000 Binary files a/assets/1571482112632.png and /dev/null differ diff --git a/assets/1571483423256.png b/assets/1571483423256.png deleted file mode 100644 index 1fbe4b6..0000000 Binary files a/assets/1571483423256.png and /dev/null differ diff --git a/assets/1571483479356.png b/assets/1571483479356.png deleted file mode 100644 index 1ecf4ee..0000000 Binary files a/assets/1571483479356.png and /dev/null differ diff --git a/assets/1571483552438.png b/assets/1571483552438.png deleted file mode 100644 index 1f71494..0000000 Binary files a/assets/1571483552438.png and /dev/null differ diff --git a/assets/1571483777703.png b/assets/1571483777703.png deleted file mode 100644 index 24b84e2..0000000 Binary files a/assets/1571483777703.png and /dev/null differ diff --git a/cookie.txt b/cookie.txt deleted file mode 100644 index e69de29..0000000 diff --git a/csdn.py b/csdn.py deleted file mode 100644 index 4a4f54e..0000000 --- a/csdn.py +++ /dev/null @@ -1,210 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -import os, time, re -import requests -import threading -from bs4 import BeautifulSoup, Comment -from selenium import webdriver -from tomd import Tomd - - -def result_file(folder_name, file_name): - folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), "articles", folder_name) - if not os.path.exists(folder): - os.makedirs(folder) - path = os.path.join(folder, file_name) - file = open(path,"w") - file.close() - else: - path = os.path.join(folder, file_name) - return path - - -def get_headers(cookie_path:str): - cookies = {} - with open(cookie_path, "r", encoding="utf-8") as f: - cookie_list = f.readlines() - for line in cookie_list: - cookie = line.split(":") - cookies[cookie[0]] = str(cookie[1]).strip() - return cookies - - -def delete_ele(soup:BeautifulSoup, tags:list): - for ele in tags: - for useless_tag in soup.select(ele): - useless_tag.decompose() - - -def delete_ele_attr(soup:BeautifulSoup, attrs:list): - for attr in attrs: - for useless_attr in soup.find_all(): - del useless_attr[attr] - - -def delete_blank_ele(soup:BeautifulSoup, eles_except:list): - for useless_attr in soup.find_all(): - try: - if useless_attr.name not in eles_except and useless_attr.text == "": - useless_attr.decompose() - except Exception: - pass - - -class TaskQueue(object): - def __init__(self): - self.VisitedList = [] - self.UnVisitedList = [] - - def getVisitedList(self): - return self.VisitedList - - def getUnVisitedList(self): - return self.UnVisitedList - - def InsertVisitedList(self, url): - if url not in self.VisitedList: - self.VisitedList.append(url) - - def InsertUnVisitedList(self, url): - if url not in self.UnVisitedList: - self.UnVisitedList.append(url) - - def RemoveVisitedList(self, url): - self.VisitedList.remove(url) - - def PopUnVisitedList(self,index=0): - url = "" - if index and self.UnVisitedList: - url = self.UnVisitedList[index] - del self.UnVisitedList[:index] - elif self.UnVisitedList: - url = self.UnVisitedList.pop() - return url - - def getUnVisitedListLength(self): - return len(self.UnVisitedList) - - -class Article(object): - def __init__(self): - self.options = webdriver.ChromeOptions() - self.options.add_experimental_option('excludeSwitches', ['enable-logging']) - self.options.add_argument('headless') - self.browser = webdriver.Chrome(options=self.options) - # 设置全局智能等待时间 - self.browser.implicitly_wait(30) - - def get_content(self, url): - self.browser.get(url) - try: - self.browser.find_element_by_xpath('//a[@class="btn-readmore"]').click() - except Exception: - pass - content = self.browser.find_element_by_xpath('//div[@id="content_views"]').get_attribute("innerHTML") - return content - - def get_md(self, url): - """ - 转换为markdown格式 - """ - content = self.get_content(url) - soup = BeautifulSoup(content, 'lxml') - # 删除注释 - for useless_tag in soup(text=lambda text: isinstance(text, Comment)): - useless_tag.extract() - # 删除无用标签 - tags = ["svg", "ul", ".hljs-button.signin"] - delete_ele(soup, tags) - # 删除标签属性 - attrs = ["class", "name", "id", "onclick", "style", "data-token", "rel"] - delete_ele_attr(soup,attrs) - # 删除空白标签 - eles_except = ["img", "br", "hr"] - delete_blank_ele(soup, eles_except) - # 转换为markdown - md = Tomd(str(soup)).markdown - return md - - -class CSDN(object): - def __init__(self, cookie_path): - self.headers = get_headers(cookie_path) - self.TaskQueue = TaskQueue() - - def get_articles(self, username:str): - """获取文章标题和链接""" - num = 0 - while True: - num += 1 - url = u'https://blog.csdn.net/' + username + '/article/list/' + str(num) - response = requests.get(url=url, headers=self.headers) - html = response.text - soup = BeautifulSoup(html, "html.parser") - articles = soup.find_all('div', attrs={"class":"article-item-box csdn-tracking-statistics"}) - if len(articles) > 0: - for article in articles: - article_title = article.a.text.strip().replace(' ',':') - article_href = article.a['href'] - yield article_title,article_href - else: - break - - def write_articals(self, username:str): - """将博文写入本地""" - print("[++] 正在爬取 {} 的博文 ......".format(username)) - artical = Article() - reademe_path = result_file(username,file_name="README.md") - with open(reademe_path,'w', encoding='utf-8') as reademe_file: - i = 1 - readme_head = "# " + username + " 的博文\n" - reademe_file.write(readme_head) - for article_title,article_href in self.get_articles(username): - print("[++++] {}. 正在处理URL:{}".format(str(i), article_href)) - text = str(i) + '. [' + article_title + ']('+ article_href +')\n' - reademe_file.write(text) - file_name = str(i) + "." + re.sub(r'[\/::*?"<>|]','-', article_title) + ".md" - artical_path = result_file(folder_name=username, file_name=file_name) - try: - md_content = artical.get_md(article_href) - md_head = "# " + str(i) + "." + article_title + "\n" - md = md_head + md_content - with open(artical_path, "w", encoding="utf-8") as artical_file: - artical_file.write(md) - except Exception: - print("[----] {}. 处理URL异常:{}".format(str(i), article_href)) - i += 1 - # time.sleep(2) - - def spider(self): - """将爬取到的文章保存到本地""" - while True: - if self.TaskQueue.getUnVisitedListLength(): - username = self.TaskQueue.PopUnVisitedList() - self.write_articals(username) - - def check_user(self, user_path:str): - with open(user_path, 'r', encoding='utf-8') as f: - users = f.readlines() - for user in users: - self.TaskQueue.InsertUnVisitedList(user.strip()) - - def run(self, user_path): - UserThread = threading.Thread(target=self.check_user, args=(user_path,)) - SpiderThread = threading.Thread(target=self.spider, args=()) - UserThread.start() - SpiderThread.start() - UserThread.join() - SpiderThread.join() - - -def main(): - user_path = 'username.txt' - csdn = CSDN('cookie.txt') - csdn.run(user_path) - - -if __name__ == "__main__": - main() - diff --git a/requirements.txt b/requirements.txt index 861958b..ea072a6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,2 @@ bs4==0.0.1 -selenium==3.141.0 requests==2.22.0 \ No newline at end of file diff --git a/test.py b/test.py new file mode 100644 index 0000000..48899d0 --- /dev/null +++ b/test.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python +# coding: utf-8 + +if __name__ == "__main__": + import CSDN + CSDN.run("ds19991999") \ No newline at end of file diff --git a/username.txt b/username.txt deleted file mode 100644 index adb7063..0000000 --- a/username.txt +++ /dev/null @@ -1 +0,0 @@ -ds19991999 \ No newline at end of file