htmlparse
parent
032bf59d82
commit
599f5c2f0a
35
crawler.py
35
crawler.py
|
@ -306,12 +306,16 @@ def cleartxt(kkk):
|
||||||
kkk = kkk.replace("\n", "")
|
kkk = kkk.replace("\n", "")
|
||||||
return kkk
|
return kkk
|
||||||
|
|
||||||
def recursion(nod, article, number, driver, dircrea):
|
def recursion(nod, article, number, driver, dircrea, bk=False):
|
||||||
if isinstance(nod, dict):
|
if isinstance(nod, dict):
|
||||||
if 'nodeName' in nod.keys() and nod['nodeName']=='#text':
|
if 'nodeName' in nod.keys() and nod['nodeName']=='#text':
|
||||||
kkk = cleartxt(nod['textContent'])
|
kkk = cleartxt(nod['textContent'])
|
||||||
if len(kkk) > 0:
|
if len(kkk) > 0:
|
||||||
|
if bk:
|
||||||
|
article += "**"
|
||||||
article += nod['textContent']
|
article += nod['textContent']
|
||||||
|
if bk:
|
||||||
|
article += "**"
|
||||||
return article, number
|
return article, number
|
||||||
|
|
||||||
elif isinstance(nod, webdriver.remote.webelement.WebElement):
|
elif isinstance(nod, webdriver.remote.webelement.WebElement):
|
||||||
|
@ -322,7 +326,7 @@ def recursion(nod, article, number, driver, dircrea):
|
||||||
article += "\n" + '#' * int(tag_name[-1]) + ' '
|
article += "\n" + '#' * int(tag_name[-1]) + ' '
|
||||||
p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
|
p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
|
||||||
for pnode in p_childNodes:
|
for pnode in p_childNodes:
|
||||||
article, number = recursion(pnode, article, number, driver, dircrea)
|
article, number = recursion(pnode, article, number, driver, dircrea, bk)
|
||||||
article += '\n'
|
article += '\n'
|
||||||
elif tag_name=="span":
|
elif tag_name=="span":
|
||||||
datatex = nod.get_attribute("data-tex")
|
datatex = nod.get_attribute("data-tex")
|
||||||
|
@ -336,11 +340,15 @@ def recursion(nod, article, number, driver, dircrea):
|
||||||
imgchunk = nod.find_elements(By.TAG_NAME, 'img')
|
imgchunk = nod.find_elements(By.TAG_NAME, 'img')
|
||||||
achunk = nod.find_elements(By.TAG_NAME, 'a')
|
achunk = nod.find_elements(By.TAG_NAME, 'a')
|
||||||
if len(imgchunk)==0 and len(achunk)==0:
|
if len(imgchunk)==0 and len(achunk)==0:
|
||||||
|
if bk:
|
||||||
|
article += "**"
|
||||||
article += nod.text
|
article += nod.text
|
||||||
|
if bk:
|
||||||
|
article += "**"
|
||||||
else:
|
else:
|
||||||
p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
|
p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
|
||||||
for pnode in p_childNodes:
|
for pnode in p_childNodes:
|
||||||
article, number = recursion(pnode, article, number, driver, dircrea)
|
article, number = recursion(pnode, article, number, driver, dircrea, bk)
|
||||||
# else:
|
# else:
|
||||||
# formula_span = nod.find_elements(By.CLASS_NAME, "ztext-math")
|
# formula_span = nod.find_elements(By.CLASS_NAME, "ztext-math")
|
||||||
# for jf in range(len(formula_span)):
|
# for jf in range(len(formula_span)):
|
||||||
|
@ -355,22 +363,29 @@ def recursion(nod, article, number, driver, dircrea):
|
||||||
else:
|
else:
|
||||||
article += "["+nod.text+"]"+"("+linksite + ")"
|
article += "["+nod.text+"]"+"("+linksite + ")"
|
||||||
elif tag_name=="b" or tag_name=="strong":
|
elif tag_name=="b" or tag_name=="strong":
|
||||||
txt = nod.text
|
p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
|
||||||
while len(txt) > 0 and txt[-1] == " ":
|
for pnode in p_childNodes:
|
||||||
txt = txt[:-1]
|
article, number = recursion(pnode, article, number, driver, dircrea, True)
|
||||||
article += " **" + txt + "** "
|
# txt = nod.text
|
||||||
|
# while len(txt) > 0 and txt[-1] == " ":
|
||||||
|
# txt = txt[:-1]
|
||||||
|
# article += " **" + txt + "** "
|
||||||
elif tag_name=="em":
|
elif tag_name=="em":
|
||||||
|
if bk:
|
||||||
|
article += "**"
|
||||||
article += nod.text
|
article += nod.text
|
||||||
|
if bk:
|
||||||
|
article += "**"
|
||||||
# elif tag_name=='td':
|
# elif tag_name=='td':
|
||||||
# article += nod.text
|
# article += nod.text
|
||||||
elif tag_name in ['table', 'tbody', 'tr', 'td', 'u']:
|
elif tag_name in ['table', 'tbody', 'tr', 'td', 'u']:
|
||||||
p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
|
p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
|
||||||
for pnode in p_childNodes:
|
for pnode in p_childNodes:
|
||||||
article, number = recursion(pnode, article, number, driver, dircrea)
|
article, number = recursion(pnode, article, number, driver, dircrea, bk)
|
||||||
elif tag_name=='p':
|
elif tag_name=='p':
|
||||||
p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
|
p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
|
||||||
for pnode in p_childNodes:
|
for pnode in p_childNodes:
|
||||||
article, number = recursion(pnode, article, number, driver, dircrea)
|
article, number = recursion(pnode, article, number, driver, dircrea, bk)
|
||||||
article += "\n"
|
article += "\n"
|
||||||
elif tag_name=="div":
|
elif tag_name=="div":
|
||||||
# atags = nod.find_elements(By.TAG_NAME, 'a')
|
# atags = nod.find_elements(By.TAG_NAME, 'a')
|
||||||
|
@ -381,7 +396,7 @@ def recursion(nod, article, number, driver, dircrea):
|
||||||
else:
|
else:
|
||||||
p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
|
p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
|
||||||
for pnode in p_childNodes:
|
for pnode in p_childNodes:
|
||||||
article, number = recursion(pnode, article, number, driver, dircrea)
|
article, number = recursion(pnode, article, number, driver, dircrea, bk)
|
||||||
elif tag_name=="figure":
|
elif tag_name=="figure":
|
||||||
imgchunk = nod.find_elements(By.TAG_NAME, 'img')
|
imgchunk = nod.find_elements(By.TAG_NAME, 'img')
|
||||||
for i in range(len(imgchunk)):
|
for i in range(len(imgchunk)):
|
||||||
|
|
Loading…
Reference in New Issue