update shell

2019-10-24 21:03:51 +08:00 · 2019-10-24 21:03:51 +08:00 · 521ae25c1b
parent f8a47d20c1
commit 521ae25c1b
14 changed files with 209 additions and 261 deletions
--- a/CSDN/init.py
+++ b/CSDN/init.py
@ -0,0 +1,2 @@
 from .csdn import run
 from .csdn import CSDN
--- a/CSDN/csdn.py
+++ b/CSDN/csdn.py
@ -0,0 +1,186 @@
 #!/usr/bin/env python
 # coding: utf-8
 import os, time, re
 import requests
 import threading
 from bs4 import BeautifulSoup, Comment
 from .tomd import Tomd
 def result_file(folder_name, file_name, article_name):
 	"""Return the path of ``file_name`` inside the output folder, creating the folder if needed.
 	The folder is ``<directory of this module>/../<article_name>/<folder_name>``.
 	When the folder does not exist yet it is created and an empty ``file_name`` is
 	created inside it; when the folder already exists the file is left untouched.
 	"""
 	folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../"+article_name, folder_name)
 	path = os.path.join(folder, file_name)
 	if not os.path.exists(folder):
 		# exist_ok: several spider threads may get here at the same time; the one
 		# that loses the race must not fail with FileExistsError.
 		os.makedirs(folder, exist_ok=True)
 		# Create the empty placeholder file and close it right away.
 		with open(path, "w"):
 			pass
 	return path
 def delete_ele(soup:BeautifulSoup, tags:list):
 	"""Remove from ``soup`` every element matched by any CSS selector in ``tags``."""
 	for selector in tags:
 		matched = soup.select(selector)
 		for node in matched:
 			node.decompose()
 def delete_ele_attr(soup:BeautifulSoup, attrs:list):
 	"""Delete every attribute named in ``attrs`` from all tags found under ``soup``."""
 	# Walk the tree once instead of calling find_all() again for each attribute.
 	for node in soup.find_all():
 		for attr in attrs:
 			# The delete is attempted on every tag, as before, whether or not the
 			# tag carries the attribute.
 			del node[attr]
 def delete_blank_ele(soup:BeautifulSoup, eles_except:list):
 	"""Drop tags under ``soup`` whose text is empty, unless the tag name is in ``eles_except``."""
 	for node in soup.find_all():
 		try:
 			if node.name in eles_except:
 				continue
 			if node.text == "":
 				node.decompose()
 		except Exception:
 			# Best effort: a node that cannot be inspected or removed is skipped.
 			pass
 class TaskQueue(object):
 	"""In-memory crawl queue made of a visited list and a pending (unvisited) list."""
 	def __init__(self):
 		# Items that were already processed.
 		self.VisitedList = []
 		# Items still waiting to be processed, in insertion order.
 		self.UnVisitedList = []
 	def getVisitedList(self):
 		"""Return the visited list itself (not a copy)."""
 		return self.VisitedList
 	def getUnVisitedList(self):
 		"""Return the pending list itself (not a copy)."""
 		return self.UnVisitedList
 	def InsertVisitedList(self, url):
 		"""Append ``url`` to the visited list unless it is already there."""
 		if url not in self.VisitedList:
 			self.VisitedList.append(url)
 	def InsertUnVisitedList(self, url):
 		"""Append ``url`` to the pending list unless it is already there."""
 		if url not in self.UnVisitedList:
 			self.UnVisitedList.append(url)
 	def RemoveVisitedList(self, url):
 		"""Remove ``url`` from the visited list; raises ValueError when it is absent."""
 		self.VisitedList.remove(url)
 	def PopUnVisitedList(self,index=0):
 		"""Remove and return one pending item, or ``[]`` when nothing is pending.
 		With the default ``index=0`` the most recently inserted item is popped.
 		With a non-zero ``index`` the item at that position is returned and it is
 		dropped from the list together with every item before it.
 		Raises IndexError when a non-zero ``index`` is out of range.
 		"""
 		url = []
 		if index and self.UnVisitedList:
 			url = self.UnVisitedList[index]
 			# Normalise negative indexes so the slice below ends right after the item.
 			position = index if index >= 0 else len(self.UnVisitedList) + index
 			# Delete up to AND including the returned item. The former ``[:index]``
 			# left it in the list, so the same item was handed out again later.
 			del self.UnVisitedList[:position + 1]
 		elif self.UnVisitedList:
 			url = self.UnVisitedList.pop()
 		return url
 	def getUnVisitedListLength(self):
 		"""Return the number of pending items."""
 		return len(self.UnVisitedList)
 class CSDN(object):
 	"""Crawler that downloads the blog posts of one CSDN user and stores them as markdown."""
 	def __init__(self, username, article__folder_name, timeout=30):
 		"""Prepare the crawler.
 		username: CSDN account whose posts are crawled.
 		article__folder_name: output folder name handed to ``result_file``.
 		timeout: seconds to wait on each HTTP request before it is abandoned.
 		"""
 		self.headers = {
 			"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36"
 		}
 		self.username = username
 		self.TaskQueue = TaskQueue()
 		self.article__folder_name = article__folder_name
 		# Counter bumped once per processed article.
 		self.url_num = 1
 		self.timeout = timeout
 	def start(self):
 		"""Collect the title and link of every listed article into the task queue."""
 		page = 0
 		while True:
 			page += 1
 			url = u'https://blog.csdn.net/' + self.username + '/article/list/' + str(page)
 			# Without a timeout a stalled connection would block the crawl forever.
 			response = requests.get(url=url, headers=self.headers, timeout=self.timeout)
 			soup = BeautifulSoup(response.text, "html.parser")
 			articles = soup.find_all('div', attrs={"class":"article-item-box csdn-tracking-statistics"})
 			if not articles:
 				# A listing page without article boxes ends the pagination.
 				break
 			for article in articles:
 				link = article.a
 				if link is None or not link.get('href'):
 					# Skip a box without a usable link instead of aborting the whole crawl.
 					continue
 				article_title = link.text.strip().replace('        ',': ')
 				self.TaskQueue.InsertUnVisitedList([article_title, link['href']])
 	def get_md(self, url):
 		"""Download one article page and return its body converted to markdown.
 		Raises ValueError when the page has no ``#content_views`` element.
 		"""
 		response = requests.get(url=url, headers=self.headers, timeout=self.timeout)
 		soup = BeautifulSoup(response.text, 'lxml')
 		content = soup.select_one("#content_views")
 		if content is None:
 			# Fail with a clear message instead of "'NoneType' object is not callable".
 			raise ValueError("no #content_views element found in {}".format(url))
 		# Remove HTML comments.
 		for useless_tag in content(text=lambda text: isinstance(text, Comment)):
 			useless_tag.extract()
 		# Remove useless elements.
 		tags = ["svg", "ul", ".hljs-button.signin"]
 		delete_ele(content, tags)
 		# Remove tag attributes.
 		attrs = ["class", "name", "id", "onclick", "style", "data-token", "rel"]
 		delete_ele_attr(content,attrs)
 		# Remove blank elements.
 		eles_except = ["img", "br", "hr"]
 		delete_blank_ele(content, eles_except)
 		# Convert to markdown.
 		return Tomd(str(content)).markdown
 	def write_readme(self):
 		"""Write README.md: a numbered list of the queued articles in reverse queue order."""
 		print("[++] 正在爬取 {} 的博文 ......".format(self.username))
 		reademe_path = result_file(self.username,file_name="README.md",article_name=self.article__folder_name)
 		with open(reademe_path,'w', encoding='utf-8') as reademe_file:
 			reademe_file.write("# " + self.username + " 的博文\n")
 			# Number from 1 with a local counter so the list does not depend on
 			# whatever value self.url_num happens to hold.
 			for number, (article_title, article_href) in enumerate(reversed(self.TaskQueue.UnVisitedList), start=1):
 				reademe_file.write(str(number) + '. [' + article_title + ']('+ article_href +')\n')
 		self.url_num = 1
 	def spider(self):
 		"""Pop articles from the queue and save each as a markdown file until the queue is empty."""
 		while True:
 			try:
 				task = self.TaskQueue.PopUnVisitedList()
 			except IndexError:
 				# Another thread emptied the list between the emptiness check and the pop.
 				break
 			if not task:
 				# Explicit end-of-queue check; the loop used to end only through a
 				# swallowed unpacking error.
 				break
 			article_title, article_href = task
 			try:
 				print("[++++] 正在处理URL：{}".format(article_href))
 				file_name = re.sub(r'[\/:：*?"<>|]','-', article_title) + ".md"
 				artical_path = result_file(folder_name=self.username, file_name=file_name, article_name=self.article__folder_name)
 				md_head = "# " + article_title + "\n"
 				md = md_head + self.get_md(article_href)
 				with open(artical_path, "w", encoding="utf-8") as artical_file:
 					artical_file.write(md)
 			except Exception:
 				# Per-article boundary: one failing article must not stop the worker.
 				print("[----] 处理URL异常：{}".format(article_href))
 			self.url_num += 1
 	def muti_spider(self, thread_num):
 		"""Drain the task queue with ``thread_num`` worker threads and wait until they finish."""
 		# Start a fixed set of workers once. The former loop kept creating a new
 		# batch of threads for as long as the queue was non-empty and never joined them.
 		workers = [threading.Thread(target=self.spider) for _ in range(max(1, thread_num))]
 		for worker in workers:
 			worker.start()
 		for worker in workers:
 			worker.join()
 def run(username: str = "ds19991999", thread_num: int = 10, article__folder_name: str = "articles"):
 	"""Crawl all posts of ``username``: collect links, write the README, then download with ``thread_num`` threads."""
 	folder_missing = not os.path.exists(article__folder_name)
 	if folder_missing:
 		os.makedirs(article__folder_name)
 	crawler = CSDN(username,article__folder_name)
 	crawler.start()
 	crawler.write_readme()
 	crawler.muti_spider(thread_num)
 if __name__ == "__main__":
 	# Script entry point: crawl the default user with 10 threads into "articles".
 	run(username="ds19991999", thread_num=10, article__folder_name="articles")
--- a/CSDN/tomd.py
+++ b/CSDN/tomd.py
--- a/README.md
+++ b/README.md
@ -1,59 +1,25 @@
-# CSDN 爬虫脚本
+# CSDN 爬虫
-主要功能：爬取 `csdn` 博客指定用户的所有博文并转换为 `markdown` 格式保存到本地。
+> 主要功能：爬取 csdn 博客指定用户的所有博文并转换为 markdown 格式保存到本地。
-## 一、运行环境
+## 下载脚本
-
+```
-需要安装`WebDriver`驱动，https://chromedriver.chromium.org/downloads ，下载与本地对应的`chrome`驱动后，将其添加至环境变量`$PATH`
+git clone https://github.com/ds19991999/csdn-spider.git
-
+cd csdn-spider
 ```shell
 python3
 python3 -m pip install -r requirements.txt
 ```
-## 二、获取脚本
+## 爬取用户全部博文
 ```python
 #!/usr/bin/env python
 # coding: utf-8
-```shell
+if __name__ == "__main__":
-git clone https://github.com/ds19991999/csdn-spider.git
+    import CSDN
     CSDN.run("ds19991999")
 ```
-## 三、用法
+## LICENSE
 ### 1.获取cookie
 登录 `csdn` 账号，进入：https://blog.csdn.net ，按 `F12` 调试网页，复制所有的 `Request Headers`，保存到`cookie.txt`文件中
 ![1571482112632](assets/1571482112632.png)
 ### 2.添加需要爬取的 `csdn` 用户
 在`username.txt`中添加用户名，一行一个
 ### 3.运行脚本
 ```shell
 python3 csdn.py
 ```
 ## 四、效果
 **运行过程**
 ![1571483423256](assets/1571483423256.png)
 **文章列表建立**：`./articles/username/README.md`
 ![1571483552438](assets/1571483552438.png)
 **爬取的博文**：`./articles/username/`
 ![1571483479356](assets/1571483479356.png)
 **博文转换效果**：
 ![1571483777703](assets/1571483777703.png)
 ## 五、LICENSE
 <a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-nc-sa/4.0/88x31.png" /></a>
--- a/assets/1571482112632.png
+++ b/assets/1571482112632.png
--- a/assets/1571483423256.png
+++ b/assets/1571483423256.png
--- a/assets/1571483479356.png
+++ b/assets/1571483479356.png
--- a/assets/1571483552438.png
+++ b/assets/1571483552438.png
--- a/assets/1571483777703.png
+++ b/assets/1571483777703.png
--- a/cookie.txt
+++ b/cookie.txt
--- a/csdn.py
+++ b/csdn.py
@ -1,210 +0,0 @@
 #!/usr/bin/env python
 # coding: utf-8
 import os, time, re
 import requests
 import threading
 from bs4 import BeautifulSoup, Comment
 from selenium import webdriver
 from tomd import Tomd
 def result_file(folder_name, file_name):
 	"""Return the path of ``file_name`` inside ``articles/<folder_name>`` next to this module.
 	When the folder does not exist yet it is created and an empty ``file_name`` is
 	created inside it; when the folder already exists the file is left untouched.
 	"""
 	folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), "articles", folder_name)
 	path = os.path.join(folder, file_name)
 	if not os.path.exists(folder):
 		# exist_ok: do not fail if the folder appears between the check and the creation.
 		os.makedirs(folder, exist_ok=True)
 		# Create the empty placeholder file and close it right away.
 		with open(path, "w"):
 			pass
 	return path
 def get_headers(cookie_path:str):
 	"""Parse a text file of ``Name: value`` lines into a dict of request headers.
 	Blank lines and lines without a header name are skipped. Names and values
 	are stripped of surrounding whitespace.
 	"""
 	headers = {}
 	with open(cookie_path, "r", encoding="utf-8") as f:
 		for line in f:
 			# Split on the first colon only, so values that contain ':' themselves
 			# (URLs, for example) are kept whole instead of being cut short.
 			name, separator, value = line.partition(":")
 			name = name.strip()
 			if not separator or not name:
 				# A blank or malformed line used to raise IndexError.
 				continue
 			headers[name] = value.strip()
 	return headers
 def delete_ele(soup:BeautifulSoup, tags:list):
 	"""Remove from ``soup`` every element matched by any CSS selector in ``tags``."""
 	for selector in tags:
 		matched = soup.select(selector)
 		for node in matched:
 			node.decompose()
 def delete_ele_attr(soup:BeautifulSoup, attrs:list):
 	"""Delete every attribute named in ``attrs`` from all tags found under ``soup``."""
 	# Walk the tree once instead of calling find_all() again for each attribute.
 	for node in soup.find_all():
 		for attr in attrs:
 			# The delete is attempted on every tag, as before, whether or not the
 			# tag carries the attribute.
 			del node[attr]
 def delete_blank_ele(soup:BeautifulSoup, eles_except:list):
 	"""Drop tags under ``soup`` whose text is empty, unless the tag name is in ``eles_except``."""
 	for node in soup.find_all():
 		try:
 			if node.name in eles_except:
 				continue
 			if node.text == "":
 				node.decompose()
 		except Exception:
 			# Best effort: a node that cannot be inspected or removed is skipped.
 			pass
 class TaskQueue(object):
 	"""In-memory crawl queue made of a visited list and a pending (unvisited) list."""
 	def __init__(self):
 		# Items that were already processed.
 		self.VisitedList = []
 		# Items still waiting to be processed, in insertion order.
 		self.UnVisitedList = []
 	def getVisitedList(self):
 		"""Return the visited list itself (not a copy)."""
 		return self.VisitedList
 	def getUnVisitedList(self):
 		"""Return the pending list itself (not a copy)."""
 		return self.UnVisitedList
 	def InsertVisitedList(self, url):
 		"""Append ``url`` to the visited list unless it is already there."""
 		if url not in self.VisitedList:
 			self.VisitedList.append(url)
 	def InsertUnVisitedList(self, url):
 		"""Append ``url`` to the pending list unless it is already there."""
 		if url not in self.UnVisitedList:
 			self.UnVisitedList.append(url)
 	def RemoveVisitedList(self, url):
 		"""Remove ``url`` from the visited list; raises ValueError when it is absent."""
 		self.VisitedList.remove(url)
 	def PopUnVisitedList(self,index=0):
 		"""Remove and return one pending item, or ``""`` when nothing is pending.
 		With the default ``index=0`` the most recently inserted item is popped.
 		With a non-zero ``index`` the item at that position is returned and it is
 		dropped from the list together with every item before it.
 		Raises IndexError when a non-zero ``index`` is out of range.
 		"""
 		url = ""
 		if index and self.UnVisitedList:
 			url = self.UnVisitedList[index]
 			# Normalise negative indexes so the slice below ends right after the item.
 			position = index if index >= 0 else len(self.UnVisitedList) + index
 			# Delete up to AND including the returned item. The former ``[:index]``
 			# left it in the list, so the same item was handed out again later.
 			del self.UnVisitedList[:position + 1]
 		elif self.UnVisitedList:
 			url = self.UnVisitedList.pop()
 		return url
 	def getUnVisitedListLength(self):
 		"""Return the number of pending items."""
 		return len(self.UnVisitedList)
 class Article(object):
 	"""Loads an article page in headless Chrome and converts its body to markdown."""
 	def __init__(self):
 		chrome_options = webdriver.ChromeOptions()
 		chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
 		chrome_options.add_argument('headless')
 		self.options = chrome_options
 		self.browser = webdriver.Chrome(options=self.options)
 		# Set the global implicit wait time.
 		self.browser.implicitly_wait(30)
 	def get_content(self, url):
 		"""Open ``url`` and return the inner HTML of the ``content_views`` div."""
 		driver = self.browser
 		driver.get(url)
 		try:
 			read_more = driver.find_element_by_xpath('//a[@class="btn-readmore"]')
 			read_more.click()
 		except Exception:
 			# Best effort: failing to find or click the "read more" link is ignored.
 			pass
 		body = driver.find_element_by_xpath('//div[@id="content_views"]')
 		return body.get_attribute("innerHTML")
 	def get_md(self, url):
 		"""Return the article at ``url`` converted to markdown."""
 		html = self.get_content(url)
 		soup = BeautifulSoup(html, 'lxml')
 		# Remove HTML comments.
 		comments = soup(text=lambda text: isinstance(text, Comment))
 		for comment in comments:
 			comment.extract()
 		# Remove useless elements.
 		delete_ele(soup, ["svg", "ul", ".hljs-button.signin"])
 		# Remove tag attributes.
 		delete_ele_attr(soup, ["class", "name", "id", "onclick", "style", "data-token", "rel"])
 		# Remove blank elements.
 		delete_blank_ele(soup, ["img", "br", "hr"])
 		# Convert to markdown.
 		return Tomd(str(soup)).markdown
 class CSDN(object):
 	"""Crawler that saves the posts of every user listed in a user file as markdown."""
 	def __init__(self, cookie_path):
 		"""Read the request headers from ``cookie_path`` and create an empty task queue."""
 		self.headers = get_headers(cookie_path)
 		self.TaskQueue = TaskQueue()
 	def get_articles(self, username:str):
 		"""Yield ``(title, link)`` for each article on the listing pages of ``username``."""
 		page = 0
 		while True:
 			page += 1
 			url = u'https://blog.csdn.net/' + username + '/article/list/' + str(page)
 			response = requests.get(url=url, headers=self.headers)
 			soup = BeautifulSoup(response.text, "html.parser")
 			articles = soup.find_all('div', attrs={"class":"article-item-box csdn-tracking-statistics"})
 			if not articles:
 				# A listing page without article boxes ends the pagination.
 				break
 			for article in articles:
 				link = article.a
 				if link is None or not link.get('href'):
 					# Skip a box without a usable link instead of aborting the generator.
 					continue
 				article_title = link.text.strip().replace('        ','：')
 				yield article_title, link['href']
 	def write_articals(self, username:str):
 		"""Write the README and one markdown file per article of ``username``."""
 		print("[++] 正在爬取 {} 的博文 ......".format(username))
 		artical = Article()
 		try:
 			reademe_path = result_file(username,file_name="README.md")
 			with open(reademe_path,'w', encoding='utf-8') as reademe_file:
 				readme_head = "# " + username + " 的博文\n"
 				reademe_file.write(readme_head)
 				for i, (article_title, article_href) in enumerate(self.get_articles(username), start=1):
 					print("[++++] {}. 正在处理URL：{}".format(str(i), article_href))
 					text = str(i) + '. [' + article_title + ']('+ article_href +')\n'
 					reademe_file.write(text)
 					file_name = str(i) + "." + re.sub(r'[\/:：*?"<>|]','-', article_title) + ".md"
 					artical_path = result_file(folder_name=username, file_name=file_name)
 					try:
 						md_content = artical.get_md(article_href)
 						md_head = "# " + str(i) + "." + article_title + "\n"
 						md = md_head + md_content
 						with open(artical_path, "w", encoding="utf-8") as artical_file:
 							artical_file.write(md)
 					except Exception:
 						# Per-article boundary: one failing article must not stop the run.
 						print("[----] {}. 处理URL异常：{}".format(str(i), article_href))
 		finally:
 			# Shut the headless browser down; nothing else ever closed it, so a
 			# browser was left behind for every crawled user.
 			artical.browser.quit()
 	def spider(self):
 		"""Save the articles of every queued user and return once the queue is empty."""
 		# The former ``while True`` never exited and busy-waited on an empty queue.
 		while self.TaskQueue.getUnVisitedListLength():
 			username = self.TaskQueue.PopUnVisitedList()
 			self.write_articals(username)
 	def check_user(self, user_path:str):
 		"""Queue every non-blank line of the file ``user_path`` as a username."""
 		with open(user_path, 'r', encoding='utf-8') as f:
 			for line in f:
 				user = line.strip()
 				if user:
 					# Blank lines would otherwise be crawled as an empty username.
 					self.TaskQueue.InsertUnVisitedList(user)
 	def run(self, user_path):
 		"""Load the usernames from ``user_path``, then crawl each of them."""
 		# Fill the queue completely before crawling: spider() stops as soon as the
 		# queue is empty, so it must not start ahead of check_user().
 		self.check_user(user_path)
 		self.spider()
 def main():
 	"""Crawl the users listed in username.txt with the headers stored in cookie.txt."""
 	crawler = CSDN('cookie.txt')
 	crawler.run('username.txt')
 if __name__ == "__main__":
 	# Start the crawl only when this file is executed as a script.
 	main()
--- a/requirements.txt
+++ b/requirements.txt
@ -1,3 +1,2 @@
 bs4==0.0.1
 selenium==3.141.0
 requests==2.22.0
--- a/test.py
+++ b/test.py
@ -0,0 +1,6 @@
 #!/usr/bin/env python
 # coding: utf-8
 if __name__ == "__main__":
    # Smoke run: crawl the default user with the package defaults.
    from CSDN import run
    run("ds19991999")
--- a/username.txt
+++ b/username.txt
@ -1 +0,0 @@
 ds19991999
		`@ -0,0 +1,2 @@`
							`from .csdn import run`
							`from .csdn import CSDN`