import re
import json
import aiohttp
import asyncio
import time
import pymysql
from asyncio.locks import Semaphore
from functools import partial

# Default HTTP headers attached to every request made through the shared
# aiohttp session (see ``main``).
headers = {
    'Cookie': 'auth_token=your_token_here',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36',
}

# Parameterized insert statement: values are passed separately to
# ``cursor.execute`` so the driver escapes them (no string-built SQL).
_INSERT_SQL = (
    "insert into tianyancha(name, birth, sex, idcard, court, createtime, "
    "caseno, base, duty, status, detail, addr) "
    "values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
)

# Expected layout of the (partially masked) ID card number.
_IDCARD_RE = re.compile(r'\d{10}[\d*]{4}\d{3}[\dxX]')

# Trailing marker that closes the address field of a listing entry.
_ADDR_SUFFIX = ' people .'


def save_data(cursor, addr_dic, obj):
    """Done-callback that stores one fetched detail record in MySQL.

    Args:
        cursor: DB-API cursor; only ``execute(sql, args)`` is used.
        addr_dic: Mapping of person name -> address parsed from the listing
            page the record came from; a missing name yields ``''``.
        obj: Finished task/future whose ``result()`` is the decoded JSON of
            the detail request, holding the record under the ``'data'`` key.

    Any failure (failed task, missing key, malformed ID card, database
    error) is reported on stdout and swallowed, so one bad record does not
    abort the rest of the crawl.
    """
    try:
        data = obj.result()['data']
        name = data['iname']
        addr = addr_dic.get(name, '')
        idcard = data['cardnum']
        if not _IDCARD_RE.match(idcard):
            raise ValueError(f'unexpected id card format: {idcard!r}')
        # Characters 6-9 are the birth year; the pattern above guarantees
        # that they are digits.
        birth = idcard[6:10] + ' year '
        sex = data.get('sex')
        if not sex:
            # Fall back to the parity of the second-to-last ID digit.
            sex = ' male ' if int(idcard[-2]) % 2 == 1 else ' Woman '
        # 'regdate' is treated as an epoch timestamp in milliseconds.
        tm = time.localtime(data.get('regdate', 0) / 1000)
        createtime = f'{tm.tm_year}-{tm.tm_mon}-{tm.tm_mday}'
        cursor.execute(_INSERT_SQL, (
            name, birth, sex, idcard, data['courtname'], createtime,
            data['casecode'], data['gistunit'], data['duty'],
            data['performance'], data['disrupttypename'], addr,
        ))
    except Exception as e:
        print(' Insert error ', e.args)


async def parse_case_data(sem, session, cid):
    """Fetch the detail record identified by ``cid`` and return its JSON.

    Args:
        sem: Semaphore limiting how many detail requests run concurrently.
        session: HTTP client session providing ``get(url)`` as an async
            context manager.
        cid: Record id taken from the listing page.

    Returns:
        The decoded JSON body of the response.
    """
    # NOTE(review): the request URL is just the id itself; the endpoint
    # prefix appears to be missing from this source - confirm before use.
    async with sem:  # Control concurrency
        async with session.get(f"{cid}") as rsp:
            return await rsp.json()


def _parse_addresses(txt):
    """Return a ``{name: address}`` mapping parsed from a listing page."""
    addr_dic = {}
    entries = re.findall(r'dishonest_base_info_detail">(.*?)</', txt, re.S)
    for entry in entries:
        parts = entry.strip().split(',')
        # Skip entries that do not have the expected four-field layout
        # instead of aborting the whole page.
        if len(parts) != 4 or not parts[3].endswith(_ADDR_SUFFIX):
            continue
        # Slice the suffix off: str.rstrip() strips a *character set* and
        # would also eat trailing letters of the address itself.
        addr_dic[parts[0]] = parts[3][:-len(_ADDR_SUFFIX)]
    return addr_dic


async def parse_province(sem, session, cursor, url):
    """Crawl every listing page under ``url`` and store each record found.

    Pages are requested as ``{url}/p1``, ``{url}/p2``, ... For each page the
    name -> address mapping and the record ids are extracted, one detail
    request per id is started (bounded by ``sem``), and ``save_data`` is
    attached as the completion callback. Crawling stops when a page has no
    next-page marker or when fetching a page fails.

    Args:
        sem: Semaphore shared by all detail requests.
        session: HTTP client session used for listing and detail requests.
        cursor: DB-API cursor handed to ``save_data``.
        url: Base listing URL, without the page suffix.
    """
    page = 1
    while True:
        # Turn the page and crawl
        page_url = f'{url}/p{page}'
        try:
            async with session.get(page_url) as rsp:
                txt = await rsp.text()
        except (aiohttp.ClientError, asyncio.TimeoutError):
            print(f' Climb to the second {page} Page failed ')
            break

        # Resolve the address corresponding to each person's name
        addr_dic = _parse_addresses(txt)

        # Extract the id of every record listed on this page
        cid_lis = re.findall(r'data-id="([\da-z]{32})"', txt)
        tasks = []
        for cid in cid_lis:
            # Crawl and parse each record in its own task
            task = asyncio.create_task(parse_case_data(sem, session, cid))
            # Store the result in MySQL once the task finishes
            task.add_done_callback(partial(save_data, cursor, addr_dic))
            tasks.append(task)
        # asyncio.wait() rejects an empty collection, so only wait when
        # the page actually produced records.
        if tasks:
            await asyncio.wait(tasks)
        print(f' The first {page} Page crawling complete ')

        # No next-page arrow means this was the last page.
        if 'tic-icon-arrow-right' not in txt:
            break
        page += 1


async def main():
    """Crawl the configured province(s) and store the records in MySQL.

    Reads the province -> listing URL mapping from ``url.json``, opens the
    database connection and one shared HTTP session, then crawls each URL
    in turn. The database connection is closed even if crawling fails.
    """
    province = " guangdong "
    # url.json stores the listing url of each province
    with open('url.json', 'r', encoding='utf-8') as fp:
        url_data = json.load(fp)
    # This could hold all provinces; only one is crawled here.
    url_lis = [url_data.get(province)]
    sem = Semaphore(4)
    conn = pymysql.connect(host='localhost', port=3306, user='user', password='password', charset='utf8', database='db', autocommit=True)
    try:
        cursor = conn.cursor()
        async with aiohttp.ClientSession(headers=headers) as session:
            for url in url_lis:
                if not url:
                    # The province key was not present in url.json.
                    print(f'No url configured for province {province!r}')
                    continue
                await parse_province(sem, session, cursor, url)
    finally:
        conn.close()


if __name__ == '__main__':
    asyncio.run(main())

