import re
import json
import time
import asyncio
from asyncio import Semaphore
from functools import partial

import aiohttp
import pymysql

headers = {
    'Cookie': 'auth_token=your_token_here',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
}


def save_data(cursor, addr_dic, obj):
    # Done-callback: pull the parsed JSON off the finished task and insert one row
    try:
        data = obj.result()['data']
        name = data['iname']
        addr = addr_dic.get(name, '')
        idcard = data['cardnum']
        # Masked ID number: 10 digits, 4 masked characters, 3 digits, check digit
        assert re.match(r'\d{10}[\d*]{4}\d{3}[\dxX]', idcard)
        birth = idcard[6:10]
        assert birth.isdigit()
        birth += ' year'
        sex = data.get('sex')
        if not sex:
            # The second-to-last digit of the ID number encodes sex: odd means male
            n = int(idcard[-2])
            sex = 'male' if n % 2 == 1 else 'female'
        tm = time.localtime(data.get('regdate', 0) / 1000)
        createtime = f'{tm.tm_year}-{tm.tm_mon}-{tm.tm_mday}'
        # Parameterized query, so the driver escapes the values
        cursor.execute(
            "insert into tianyancha(name, birth, sex, idcard, court, createtime, "
            "caseno, base, duty, status, detail, addr) "
            "values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
            (name, birth, sex, idcard, data['courtname'], createtime,
             data['casecode'], data['gistunit'], data['duty'],
             data['performance'], data['disrupttypename'], addr)
        )
    except Exception as e:
        print('Insert error:', e.args)


async def parse_case_data(sem, session, cid):
    # Fetch the detail record for one case id
    async with sem:  # Limit concurrency
        async with session.get(f"https://shixin.tianyancha.com/shixin/getDishonestinfoDetailWeb.json?bussinessId={cid}") as rsp:
            return await rsp.json()


async def parse_province(sem, session, cursor, url):
    page = 1
    while True:
        # Crawl the listing page by page
        page_url = f'{url}/p{page}'
        async with session.get(page_url) as rsp:
            try:
                txt = await rsp.text()
                # Map each person's name to their address
                addr_dic = {}
                pps = [i.strip() for i in re.findall(r'dishonest_base_info_detail">(.*?)</', txt, re.S)]
                for itm in pps:
                    try:
                        name, _, _, addr = itm.split(',')
                        assert addr.endswith(' people .')
                        addr = addr[:-len(' people .')]  # drop the fixed suffix (rstrip would strip characters, not a suffix)
                        addr_dic[name] = addr
                    except Exception:
                        pass
                # Extract the 32-character id of each dishonesty record
                cid_lis = re.findall(r'data-id="([\da-z]{32})"', txt)
                tasks = []
                for cid in cid_lis:
                    # One coroutine per record to fetch and parse it
                    task = asyncio.create_task(parse_case_data(sem, session, cid))
                    # Hand the result to MySQL in a done-callback
                    task.add_done_callback(partial(save_data, cursor, addr_dic))
                    tasks.append(task)
                if tasks:  # asyncio.wait raises on an empty task set
                    await asyncio.wait(tasks)
                print(f'Page {page} crawled')
                # No "next page" arrow means this was the last page
                if 'tic-icon-arrow-right' not in txt:
                    break
                page += 1
            except Exception:
                print(f'Failed to crawl page {page}')
                break


async def main():
province = " guangdong "
url_data = json.load(open('url.json', 'r', encoding='utf-8')) # url.json: Store the corresponding url Of json file
url_lis = [url_data.get(province)] # This is for all provinces to crawl together , But I only crawl from Guangdong
sem = Semaphore(4)
conn = pymysql.connect(host='localhost', port=3306, user='user', password='password', charset='utf8', database='db', autocommit=True)
cursor = conn.cursor()
async with aiohttp.ClientSession(headers=headers) as session:
for url in url_lis:
await parse_province(sem, session, cursor, url)
cursor.close()
conn.close() if __name__ == '__main__':
asyncio.run(main())
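
For reference, the script depends on two external artifacts that the post does not show. Below is a minimal sketch of the tianyancha table: the column names come from the INSERT statement above, but the column types are my own assumptions, not the author's actual schema.

-- Hypothetical schema inferred from the INSERT statement; types are guesses
CREATE TABLE tianyancha (
    name VARCHAR(64),
    birth VARCHAR(16),
    sex VARCHAR(8),
    idcard VARCHAR(32),
    court VARCHAR(128),
    createtime VARCHAR(16),
    caseno VARCHAR(64),
    base VARCHAR(255),
    duty TEXT,
    status VARCHAR(64),
    detail VARCHAR(255),
    addr VARCHAR(255)
) DEFAULT CHARSET = utf8;

The script also expects a url.json file mapping each province key to that province's listing URL, to which /p{page} is appended. The shape would be something like the following; the path here is a placeholder, not the site's real endpoint.

{
    "guangdong": "https://shixin.tianyancha.com/<province-listing-path>"
}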
