Here is what the result looks like:

 [Image: insert picture description here]

Enough talk — let's go straight to the code:

# author   : sunzd
# date     : 2019/9/01
# position : beijing
"""Walk the article-list pages of a CSDN blog and request every listed article.

For each list page the script prints the title, type and link of every
article entry it finds, requests the article URL, and then moves on to the
next list page, starting over from page 1 after the last one.
"""
import re
import time
from typing import Optional
from urllib import error
from urllib import request

from bs4 import BeautifulSoup
from fake_useragent import UserAgent

# One article entry on a list page.  re.S makes '.' match any character,
# including line breaks, so an entry may span several lines of markup.
ARTICLE_PATTERN = re.compile(
    '<div class="article-item-box csdn-tracking-statistics" data(.+?)</div>',
    re.S,
)

# Markup of the disabled "next page" pager item.
# NOTE(review): presumably it is only rendered on the last list page -- confirm.
LAST_PAGE_PATTERN = re.compile(
    '<li class="js-page-next js-page-action ui-pager ui-pager-disabled">'
)

# Highest list page visited before the driver starts over from page 1.
MAX_PAGE = 6

# Pause between two list pages so the endless loop does not hammer the server.
REQUEST_PAUSE_SECONDS = 1


def html_request(url: Optional[str]) -> Optional[str]:
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address to fetch; ``None`` is accepted and skipped.

    Returns:
        The decoded page, or ``None`` when *url* is ``None`` or the request
        fails with a ``URLError`` (its code/reason are printed).
    """
    if url is None:
        return None
    print("download html is :{0}".format(url))
    # NOTE: a URL containing Chinese characters must be percent-encoded by
    # the caller first; this function sends it as-is.
    # Simulate browser behaviour with a random browser identification.  The
    # HTTP header is spelled 'User-Agent'; 'UserAgent' is not a known header.
    headers = {'User-Agent': str(UserAgent().random)}
    req = request.Request(url, headers=headers)
    try:
        # The context manager closes the connection even if read() fails.
        with request.urlopen(req) as response:
            return response.read().decode('utf-8')
    except error.URLError as e:
        # HTTPError carries a status code; a plain URLError only a reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return None


def html_parser(url: Optional[str], html: Optional[str]) -> Optional[int]:
    """Print and request every article found on one list page.

    Args:
        url: Address the page was downloaded from (only checked for ``None``).
        html: Page source as returned by ``html_request``.

    Returns:
        ``None`` when *url* or *html* is ``None``; otherwise ``1`` if the
        disabled "next page" marker is present in *html* and ``0`` if not.
    """
    if url is None or html is None:
        return None

    articles = ARTICLE_PATTERN.findall(html.replace('\n', ''))
    print(len(articles))

    for article in articles:
        soup = BeautifulSoup(article, 'html.parser')
        title = soup.find('a', attrs={'target': '_blank'})
        if title is None:
            # Entry without a title link: nothing to print or request.
            continue

        type_tag = title.span
        article_type = type_tag.text if type_tag is not None else ''
        # NOTE(review): once every space has been removed, the space-padded
        # markers below can no longer match; they look like translated
        # article-type labels -- confirm against the real page text.
        title_text = (
            title.text.replace(' ', '').replace(" primary ", "").replace(" turn ", "")
        )
        link = title.attrs.get('href')

        print(" Article title :{0}\n Type of article :{1}".format(title_text, article_type))
        print(" The article links :{0}".format(link))
        html_request(link)

        # The 'info-box d-flex align-content-center' <div> of an entry is not
        # evaluated.  Notes kept from an earlier attempt at reading it:
        #   * infors.p.next_siblings skips the first <p> node, because a node
        #     is not one of its own siblings;
        #   * infors.children also yields ' ' text nodes, which have to be
        #     filtered out; only children with a <span> are of interest.

    last_page_markers = LAST_PAGE_PATTERN.findall(html)
    print(" Last page or not :{0}----{1}".format(len(last_page_markers), last_page_markers))
    return 1 if last_page_markers else 0


def _list_url(name: str, page: int) -> str:
    """Return the address of article-list page *page* of blog *name*."""
    return "https://blog.csdn.net/" + name + "/article/list/" + str(page) + '?'


def main() -> None:
    """Cycle through the article-list pages until the process is interrupted."""
    name = ' Your own name '  # placeholder: put the blog's user name here
    page = 1
    # Intentionally endless: after the last page the walk restarts at page 1.
    while True:
        url = _list_url(name, page)
        html = html_request(url)
        is_last_page = html_parser(url, html)
        page += 1
        if is_last_page or page > MAX_PAGE:
            page = 1
        time.sleep(REQUEST_PAUSE_SECONDS)


if __name__ == '__main__':
    main()