This blog post continues our study of BeautifulSoup; the target site this time is "Liyang Photography Circle", a local forum.
The paging rule of the target site is as follows:
http://www.jsly001.com/thread-htm-fid-45-page-{ Page number }.html
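For illustration, here is a minimal sketch of how the list-page URLs could be generated from this template (the page range 1 to 10 is an assumption for the example, not taken from the original code):

# Build list-page URLs from the paging rule; the range 1..10 is only an example
base_url = "http://www.jsly001.com/thread-htm-fid-45-page-{}.html"
page_urls = [base_url.format(page) for page in range(1, 11)]
print(page_urls[0])  # http://www.jsly001.com/thread-htm-fid-45-page-1.html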
The code is written with the threading module + the requests module + the BeautifulSoup module.
Data is collected following the list page → detail page pattern.
This is a hands-on case; the relevant bs4 knowledge points were covered in the previous post, so the complete code is shown first, followed by notes on the comments and the key functions.
import random
import threading
import logging

import requests
import lxml  # the lxml parser used below must be installed: pip install lxml
from bs4 import BeautifulSoup

logging.basicConfig(level=logging.NOTSET)  # Set the log output level


# Declare the LiYangThread class, which inherits from threading.Thread
class LiYangThread(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)  # Initialize the thread object
        self._headers = self._get_headers()  # Pick a random User-Agent
        self._timeout = 5  # Request timeout in seconds

    # Each thread fetches from the shared target site
    def run(self):
        # while True:  # Uncomment here to keep the thread looping
        res = None
        try:
            res = requests.get(url="http://www.jsly001.com/thread-htm-fid-45-page-1.html",
                               headers=self._headers,
                               timeout=self._timeout)  # Fetch the first list page as a test
        except Exception as e:
            logging.error(e)
        if res is not None:
            html_text = res.text
            self._format_html(html_text)  # Call the HTML parsing function

    def _format_html(self, html):
        # Parse with the lxml parser
        soup = BeautifulSoup(html, 'lxml')
        # Locate the row that separates the ordinary threads, mainly to skip the pinned topics
        part_tr = soup.find(attrs={'class': 'bbs_tr4'})
        if part_tr is not None:
            # Collect the detail-page links that follow the separator
            items = part_tr.find_all_next(attrs={"name": "readlink"})
        else:
            items = soup.find_all(attrs={"name": "readlink"})
        # Extract the title and detail-page URL of each thread
        data = [(item.text, f'http://www.jsly001.com/{item["href"]}') for item in items]
        # Visit each detail page
        for name, url in data:
            self._get_imgs(name, url)

    def _get_imgs(self, name, url):
        """Extract the image addresses from a detail page"""
        res = None
        try:
            res = requests.get(url=url, headers=self._headers, timeout=self._timeout)
        except Exception as e:
            logging.error(e)
        # Image extraction logic
        if res is not None:
            soup = BeautifulSoup(res.text, 'lxml')
            origin_div1 = soup.find(attrs={'class': 'tpc_content'})
            origin_div2 = soup.find(attrs={'class': 'imgList'})
            content = origin_div2 if origin_div2 else origin_div1
            if content is not None:
                imgs = content.find_all('img')
                # print([img.get("src") for img in imgs])
                self._save_img(name, imgs)  # Save the pictures

    def _save_img(self, name, imgs):
        """Save the pictures"""
        for img in imgs:
            url = img.get("src")
            if url.find('http') < 0:
                continue
            # Read the id attribute from the parent tag
            id_ = img.find_parent('span').get("id")
            res = None
            try:
                res = requests.get(url=url, headers=self._headers, timeout=self._timeout)
            except Exception as e:
                logging.error(e)
            if res is not None:
                name = name.replace("/", "_")
                # Note: create the imgs folder in the working directory before running
                with open(f'./imgs/{name}_{id_}.jpg', "wb+") as f:
                    f.write(res.content)

    def _get_headers(self):
        uas = [
            "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
        ]
        ua = random.choice(uas)
        headers = {
            "user-agent": ua
        }
        return headers


if __name__ == '__main__':
    my_thread = LiYangThread()
    my_thread.run()
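Note that the entry point above runs a single thread by calling run() directly. A minimal sketch of launching several threads concurrently (the thread count of 5 and the start()/join() calls are assumptions for illustration, not part of the original code):

# Hypothetical multi-threaded launch; 5 threads is an arbitrary example count
if __name__ == '__main__':
    threads = [LiYangThread() for _ in range(5)]
    for t in threads:
        t.start()  # start() schedules each thread's run() to execute concurrently
    for t in threads:
        t.join()   # wait for all threads to finish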
In this case, BeautifulSoup uses the lxml parser to parse the HTML data. This parser will be used again in later posts; note that the lxml module must be installed (pip install lxml) before use.
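As a small illustration of the parser choice (the HTML fragment below is invented for the example, and html.parser is mentioned only as a built-in fallback):

from bs4 import BeautifulSoup

html = "<div class='tpc_content'><img src='http://example.com/a.jpg'></div>"
soup = BeautifulSoup(html, 'lxml')           # requires the lxml package: pip install lxml
# soup = BeautifulSoup(html, 'html.parser')  # built-in fallback if lxml is unavailable
print(soup.find(attrs={'class': 'tpc_content'}).img.get('src'))  # http://example.com/a.jpg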
The data extraction part relies on the two functions soup.find() and soup.find_all(). The code also uses the find_parent() function to read the id attribute from a parent tag:
# Read the id attribute from the parent tag
id_ = img.find_parent('span').get("id")
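A self-contained sketch of how find(), find_all() and find_parent() relate to each other (the HTML fragment and the id values are invented for the example):

from bs4 import BeautifulSoup

html = ("<span id='img_1'><img src='http://example.com/1.jpg'></span>"
        "<span id='img_2'><img src='http://example.com/2.jpg'></span>")
soup = BeautifulSoup(html, 'lxml')
first_span = soup.find('span')                # find() returns only the first match
imgs = soup.find_all('img')                   # find_all() returns every matching tag
print(imgs[1].find_parent('span').get('id'))  # img_2, read from the parent tag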
The DEBUG information printed while the code runs is controlled by the logging output level.
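A small sketch of the logging configuration (the ERROR alternative is an assumption, shown only to contrast with the NOTSET level used in the code):

import logging

logging.basicConfig(level=logging.NOTSET)   # show everything, including DEBUG output from requests/urllib3
# logging.basicConfig(level=logging.ERROR)  # show only errors and above
logging.debug("this message appears only when the level allows DEBUG")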
Code repository address: https://codechina.csdn.net/hihell/python120 — feel free to follow it or give it a Star.
This post is an application of bs4; if needed, please revisit and study it repeatedly.
Today is day 239 of 365 days of continuous writing.
Looking forward to your follows, likes, comments, and favorites.
More highlights:
《100 Crawler Examples》, a paid column; purchasing it unlocks the whole series.