"""Scrape the Douban Books Top 250 list and save title/author/score/blurb to a text file."""
import requests
from bs4 import BeautifulSoup


def get_html(url):
    """Fetch *url* and return its HTML text.

    Sends a desktop-Chrome User-Agent so the site does not reject the
    request as an obvious bot.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    return requests.get(url, headers=headers).text


def all_page():
    """Return the URLs of all 10 list pages (start = 0, 25, ..., 225)."""
    base_url = 'https://book.douban.com/top250?start='
    # Douban paginates 25 books per page; offsets run 0..225.
    return [base_url + str(page) for page in range(0, 250, 25)]


def html_parse(filename=' Douban books Top250.txt'):
    """Parse every list page and write one record per book to *filename*.

    Each record is four labelled lines (title, author, score, blurb)
    followed by a separator line.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        for url in all_page():
            soup = BeautifulSoup(get_html(url), 'lxml')
            # Title lives in the <a title="..."> inside div.pl2
            alldiv = soup.find_all('div', class_='pl2')
            names = [a.find('a')['title'] for a in alldiv]
            # Author / publisher line
            allp = soup.find_all('p', class_='pl')
            authors = [p.get_text() for p in allp]
            # Rating
            starspan = soup.find_all('span', class_='rating_nums')
            scores = [s.get_text() for s in starspan]
            # One-line blurb
            sumspan = soup.find_all('span', class_='inq')
            summaries = [i.get_text() for i in sumspan]
            # NOTE(review): a book with no 'inq' blurb makes `summaries`
            # shorter than the other lists, so zip() silently drops and
            # misaligns trailing records on that page — a per-item parse
            # of each table row would be needed to fix this properly.
            for name, author, score, summary in zip(names, authors, scores,
                                                    summaries):
                record = (' Title :' + str(name) + '\n'
                          + ' author :' + str(author) + '\n'
                          + ' score :' + str(score) + '\n'
                          + ' brief introduction :' + str(summary) + '\n')
                # write(), not writelines(): we emit one pre-joined string.
                f.write(record + '=======================' + '\n')


if __name__ == '__main__':
    # Guard so importing this module does not trigger network I/O.
    html_parse()
    print(' Saved successfully .')
A beginner's first steps on the web-scraping road — this is my first small experiment.
This article participates in the Tencent Cloud media sharing plan; you are welcome to join and share your own work.