The parsing libraries are all used in much the same way.
The biggest advantage of bs4 is that its selector expressions are concise and extraction is simple. The disadvantage is that the extracted text needs to be post-processed, unlike re and lxml, which can directly extract exactly the text you need, concisely and without redundancy.
An experienced developer has written up the specific usage in great detail.
The official documentation explains the usage of the bs4 library in detail; it can be said that its summary is very comprehensive.
class TiebaSpider(object):
    """Small page scraper: download a page, select paragraphs, save them.

    The three steps are exposed as separate methods so a caller can chain
    them: ``get_html`` -> ``parse_html`` -> ``save_html``.
    """

    # Browser-like headers sent with every download request.
    _HEADERS = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/88.0.4324.182 Safari/537.36'
        ),
    }

    def __init__(self):
        # Initialised empty; none of the methods in this class read it.
        self.url = ''

    def get_html(self, url, timeout=10):
        """Download *url* and return the response body as text.

        Args:
            url: Address of the page to fetch.
            timeout: Value passed to ``requests.get`` as ``timeout`` so a
                stalled server cannot block the caller forever.

        Returns:
            The decoded response body (``Response.text``).
        """
        res = requests.get(url=url, headers=self._HEADERS, timeout=timeout)
        return res.text

    def parse_html(self, html):
        """Return the ``<p>`` elements found under the ``#content`` element.

        Args:
            html: Page markup, as returned by ``get_html``.

        Returns:
            The result of ``select('#content p')`` on the parsed document.
        """
        soup = BeautifulSoup(html, 'html.parser')
        return soup.select('#content p')

    def save_html(self, filename, html, directory='D:/request/'):
        """Write one line per paragraph to ``directory/filename``.

        Each item is converted with ``str()`` and its enclosing ``<p ...>``
        and ``</p>`` tags are removed; any markup nested inside the
        paragraph is kept as-is.

        Args:
            filename: Name of the output file inside *directory*.
            html: Iterable of paragraph elements (or their string form),
                e.g. the value returned by ``parse_html``.
            directory: Folder to write into. Defaults to the location the
                original script used.
        """
        import os

        lines = [self._inner_markup(item) + '\n' for item in html]
        # Explicit UTF-8: the platform default codec may not be able to
        # encode the scraped text.
        with open(os.path.join(directory, filename), 'w',
                  encoding='utf-8') as f:
            f.writelines(lines)

    @staticmethod
    def _inner_markup(element):
        """Return the markup between an element's outer ``<p>`` tags."""
        markup = str(element)
        closing = '</p>'
        if markup.startswith(('<p>', '<p ')) and markup.endswith(closing):
            # Cut at the end of the opening tag rather than at a fixed
            # offset, so attributes on the tag do not leak into the output.
            return markup.partition('>')[2][:-len(closing)]
        # Not a <p>...</p> wrapper: write it unchanged instead of slicing
        # off arbitrary characters.
        return markup
Part of the source code