htmlparse

master
ZouJiu 2023-07-23 13:32:37 +08:00
parent 032bf59d82
commit 599f5c2f0a
1 changed file with 25 additions and 10 deletions

View File

@@ -306,12 +306,16 @@ def cleartxt(kkk):
kkk = kkk.replace("\n", "") kkk = kkk.replace("\n", "")
return kkk return kkk
def recursion(nod, article, number, driver, dircrea): def recursion(nod, article, number, driver, dircrea, bk=False):
if isinstance(nod, dict): if isinstance(nod, dict):
if 'nodeName' in nod.keys() and nod['nodeName']=='#text': if 'nodeName' in nod.keys() and nod['nodeName']=='#text':
kkk = cleartxt(nod['textContent']) kkk = cleartxt(nod['textContent'])
if len(kkk) > 0: if len(kkk) > 0:
if bk:
article += "**"
article += nod['textContent'] article += nod['textContent']
if bk:
article += "**"
return article, number return article, number
elif isinstance(nod, webdriver.remote.webelement.WebElement): elif isinstance(nod, webdriver.remote.webelement.WebElement):
@@ -322,7 +326,7 @@ def recursion(nod, article, number, driver, dircrea):
article += "\n" + '#' * int(tag_name[-1]) + ' ' article += "\n" + '#' * int(tag_name[-1]) + ' '
p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod) p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
for pnode in p_childNodes: for pnode in p_childNodes:
article, number = recursion(pnode, article, number, driver, dircrea) article, number = recursion(pnode, article, number, driver, dircrea, bk)
article += '\n' article += '\n'
elif tag_name=="span": elif tag_name=="span":
datatex = nod.get_attribute("data-tex") datatex = nod.get_attribute("data-tex")
@@ -336,11 +340,15 @@ def recursion(nod, article, number, driver, dircrea):
imgchunk = nod.find_elements(By.TAG_NAME, 'img') imgchunk = nod.find_elements(By.TAG_NAME, 'img')
achunk = nod.find_elements(By.TAG_NAME, 'a') achunk = nod.find_elements(By.TAG_NAME, 'a')
if len(imgchunk)==0 and len(achunk)==0: if len(imgchunk)==0 and len(achunk)==0:
if bk:
article += "**"
article += nod.text article += nod.text
if bk:
article += "**"
else: else:
p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod) p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
for pnode in p_childNodes: for pnode in p_childNodes:
article, number = recursion(pnode, article, number, driver, dircrea) article, number = recursion(pnode, article, number, driver, dircrea, bk)
# else: # else:
# formula_span = nod.find_elements(By.CLASS_NAME, "ztext-math") # formula_span = nod.find_elements(By.CLASS_NAME, "ztext-math")
# for jf in range(len(formula_span)): # for jf in range(len(formula_span)):
@@ -355,22 +363,29 @@ def recursion(nod, article, number, driver, dircrea):
else: else:
article += "["+nod.text+"]"+"("+linksite + ")" article += "["+nod.text+"]"+"("+linksite + ")"
elif tag_name=="b" or tag_name=="strong": elif tag_name=="b" or tag_name=="strong":
txt = nod.text p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
while len(txt) > 0 and txt[-1] == " ": for pnode in p_childNodes:
txt = txt[:-1] article, number = recursion(pnode, article, number, driver, dircrea, True)
article += " **" + txt + "** " # txt = nod.text
# while len(txt) > 0 and txt[-1] == " ":
# txt = txt[:-1]
# article += " **" + txt + "** "
elif tag_name=="em": elif tag_name=="em":
if bk:
article += "**"
article += nod.text article += nod.text
if bk:
article += "**"
# elif tag_name=='td': # elif tag_name=='td':
# article += nod.text # article += nod.text
elif tag_name in ['table', 'tbody', 'tr', 'td', 'u']: elif tag_name in ['table', 'tbody', 'tr', 'td', 'u']:
p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod) p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
for pnode in p_childNodes: for pnode in p_childNodes:
article, number = recursion(pnode, article, number, driver, dircrea) article, number = recursion(pnode, article, number, driver, dircrea, bk)
elif tag_name=='p': elif tag_name=='p':
p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod) p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
for pnode in p_childNodes: for pnode in p_childNodes:
article, number = recursion(pnode, article, number, driver, dircrea) article, number = recursion(pnode, article, number, driver, dircrea, bk)
article += "\n" article += "\n"
elif tag_name=="div": elif tag_name=="div":
# atags = nod.find_elements(By.TAG_NAME, 'a') # atags = nod.find_elements(By.TAG_NAME, 'a')
@@ -381,7 +396,7 @@ def recursion(nod, article, number, driver, dircrea):
else: else:
p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod) p_childNodes = driver.execute_script("return arguments[0].childNodes;", nod)
for pnode in p_childNodes: for pnode in p_childNodes:
article, number = recursion(pnode, article, number, driver, dircrea) article, number = recursion(pnode, article, number, driver, dircrea, bk)
elif tag_name=="figure": elif tag_name=="figure":
imgchunk = nod.find_elements(By.TAG_NAME, 'img') imgchunk = nod.find_elements(By.TAG_NAME, 'img')
for i in range(len(imgchunk)): for i in range(len(imgchunk)):