简介:本文详细解析如何利用Python爬虫技术自动化查询国家企业信用信息公示系统,重点突破极验Geetest验证码,提供完整技术方案与合规建议。
Geetest 4.0采用行为轨迹分析,传统破解方式已失效。需通过模拟真实用户操作(鼠标移动轨迹、点击间隔)完成验证,或集成第三方打码平台。
# 基础依赖(pycryptodome 提供后文示例中使用的 Crypto.Cipher 模块)
pip install requests selenium pyppeteer pillow numpy opencv-python pycryptodome
import random
import time

from selenium.webdriver import ActionChains


def simulate_human_behavior(driver, start=(100, 100), end=(300, 300), steps=20):
    """Move the pointer along a jittered straight line, then click.

    The path runs from ``start`` to ``end`` in ``steps`` segments. Every
    waypoint is displaced by a random +/-5 px on each axis and followed by
    a random 0.05-0.2 s pause. After the last waypoint a click is issued
    and the function sleeps for a random 1-2 s.

    Args:
        driver: Selenium WebDriver instance to drive.
        start: ``(x, y)`` of the nominal path start, in pixels.
        end: ``(x, y)`` of the nominal path end, in pixels.
        steps: Number of segments the path is divided into (>= 1).

    Raises:
        ValueError: If ``steps`` is smaller than 1.
    """
    if steps < 1:
        raise ValueError("steps must be at least 1")

    start_x, start_y = start
    end_x, end_y = end

    # move_by_offset() is relative to the *current* pointer position, so the
    # absolute waypoints must be converted to deltas from the previous one.
    # Passing the absolute coordinates directly would accumulate them and
    # push the pointer far outside the viewport.
    # NOTE(review): assumes the pointer starts at the viewport origin (0, 0)
    # -- confirm for the session/frame this is called in.
    prev_x, prev_y = 0, 0

    for i in range(1, steps + 1):
        # Interpolate, round to whole pixels (the API expects integers),
        # then add the random jitter.
        target_x = round(start_x + (end_x - start_x) * i / steps)
        target_y = round(start_y + (end_y - start_y) * i / steps)
        target_x += random.randint(-5, 5)
        target_y += random.randint(-5, 5)

        move = ActionChains(driver)
        move.move_by_offset(target_x - prev_x, target_y - prev_y)
        move.pause(random.uniform(0.05, 0.2))
        move.perform()

        prev_x, prev_y = target_x, target_y

    # Click at the final pointer position.
    ActionChains(driver).click().perform()
    time.sleep(random.uniform(1, 2))
import base64

import requests


def solve_geetest_via_api(screenshot_path, timeout=30):
    """Submit a captcha screenshot to a solving service and return its answer.

    The file at ``screenshot_path`` is read, base64-encoded and POSTed as
    JSON together with the fixed type tag ``'geetest4'``.

    Args:
        screenshot_path: Path of the screenshot image to upload.
        timeout: Seconds to wait for the service before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Returns:
        The value of the ``'solution'`` key of the JSON response, or
        ``None`` when the key is absent.

    Raises:
        OSError: If the screenshot cannot be read.
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
    """
    with open(screenshot_path, 'rb') as f:
        img_data = base64.b64encode(f.read()).decode()

    payload = {
        'image': img_data,
        'type': 'geetest4',
    }
    # NOTE(review): the endpoint looks like a placeholder -- confirm the
    # real service URL and its request/response schema.
    response = requests.post(
        'https://api.captcha-solver.com/solve',
        json=payload,
        timeout=timeout,
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    return response.json().get('solution')
import base64
import json

import requests
from Crypto.Cipher import AES


class GsxtCrawler:
    """HTTP client that posts encrypted search requests and decrypts replies.

    A single ``requests.Session`` with fixed ``User-Agent`` and ``Referer``
    headers is reused for every request.
    """

    # Placeholder values carried over from the example; they are NOT valid
    # AES sizes (14 and 13 bytes). Real values must be supplied through the
    # constructor before `_decrypt_response` can work.
    AES_KEY = b'16byte_aes_key'
    AES_IV = b'16byte_aes_iv'

    def __init__(self, aes_key=None, aes_iv=None, timeout=30):
        """Create the session and store the crypto/transport settings.

        Args:
            aes_key: AES key (16, 24 or 32 bytes). Defaults to the class
                placeholder ``AES_KEY``.
            aes_iv: CBC initialisation vector (16 bytes). Defaults to the
                class placeholder ``AES_IV``.
            timeout: Per-request timeout in seconds.
        """
        self.aes_key = self.AES_KEY if aes_key is None else aes_key
        self.aes_iv = self.AES_IV if aes_iv is None else aes_iv
        self.timeout = timeout

        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
            'Referer': 'http://www.gsxt.gov.cn',
        })

    def _decrypt_response(self, encrypted_data):
        """Decode a base64 AES-CBC payload and parse it as JSON.

        Args:
            encrypted_data: Base64 text of the ciphertext.

        Returns:
            The object produced by ``json.loads`` on the decrypted text.

        Raises:
            ValueError: If the configured key or IV has an invalid length
                (the built-in placeholders do), or the payload cannot be
                decrypted/parsed.
        """
        # Validate up front so a misconfiguration is reported clearly rather
        # than as a low-level cipher error.
        if len(self.aes_key) not in (16, 24, 32):
            raise ValueError(
                'AES key must be 16, 24 or 32 bytes long, got %d; pass the '
                'real key as aes_key=' % len(self.aes_key)
            )
        if len(self.aes_iv) != AES.block_size:
            raise ValueError(
                'AES IV must be %d bytes long, got %d; pass the real IV as '
                'aes_iv=' % (AES.block_size, len(self.aes_iv))
            )

        cipher = AES.new(self.aes_key, AES.MODE_CBC, self.aes_iv)
        decrypted = cipher.decrypt(base64.b64decode(encrypted_data))
        # Keep only the bytes before the first NUL, i.e. strip zero padding.
        # NOTE(review): presumably the server zero-pads; if it uses PKCS#7
        # this needs Crypto.Util.Padding.unpad instead -- confirm.
        plaintext = decrypted.split(b'\x00')[0]
        return json.loads(plaintext.decode())

    def _pass_geetest(self):
        """Obtain a captcha challenge token.

        Not implemented in this example; override in a subclass or pass
        ``token=`` to `search_enterprise`.
        """
        raise NotImplementedError(
            '_pass_geetest() is not implemented; override it or pass token='
        )

    def _encrypt_search_params(self, keyword):
        """Encrypt the search keyword into the request's ``params`` value.

        Not implemented in this example; override in a subclass.
        """
        raise NotImplementedError(
            '_encrypt_search_params() is not implemented; override it'
        )

    def search_enterprise(self, keyword, token=None):
        """Run one search and return the decrypted response.

        Args:
            keyword: Search term handed to `_encrypt_search_params`.
            token: Already-obtained challenge token. When ``None`` the
                token is fetched through `_pass_geetest`.

        Returns:
            The parsed JSON object from `_decrypt_response`.

        Raises:
            requests.RequestException: On network failure, timeout or an
                HTTP error status.
        """
        # 1. Obtain the captcha token unless the caller supplied one.
        challenge_token = self._pass_geetest() if token is None else token

        # 2. Build the encrypted request parameters.
        encrypted_params = self._encrypt_search_params(keyword)

        # 3. Send the request.
        url = 'http://www.gsxt.gov.cn/api/search'
        data = {
            'token': challenge_token,
            'params': encrypted_params,
        }
        response = self.session.post(url, data=data, timeout=self.timeout)
        # An error page is not a valid ciphertext; surface the HTTP error.
        response.raise_for_status()
        return self._decrypt_response(response.text)
通过分析前端JS代码(通常在/static/js/encrypt.js),可定位加密逻辑:
_timestamp需同步服务器时间
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


def init_browser(url='http://www.gsxt.gov.cn'):
    """Start Chrome with the example's option set and open ``url``.

    Args:
        url: Page to load once the browser is up.

    Returns:
        The running ``webdriver.Chrome`` instance. The caller owns it and
        is responsible for calling ``quit()``.

    Raises:
        Exception: Whatever ``driver.get`` raises; the browser is shut
            down first so no Chrome process is left behind.
    """
    options = Options()
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_experimental_option('excludeSwitches', ['enable-automation'])

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
    except Exception:
        # The caller never receives the driver on failure, so it could not
        # clean it up; do it here and re-raise.
        driver.quit()
        raise
    return driver
def main():
    """Run the example end to end: browser captcha step, then API search.

    Opens the browser, performs the simulated pointer interaction inside
    the first iframe, copies the browser cookies into a ``GsxtCrawler``
    session, runs one search and prints the result as JSON. The browser is
    always closed on exit.
    """
    # Local import so this function does not depend on any other snippet
    # importing the locator constants.
    from selenium.webdriver.common.by import By

    driver = init_browser()
    try:
        # Wait for the captcha widget to load.
        time.sleep(5)

        # Run the behaviour simulation (adjust to the actual DOM).
        # find_element(By..., ...) replaces find_element_by_tag_name, which
        # was removed in Selenium 4.3.
        geetest_iframe = driver.find_element(By.TAG_NAME, 'iframe')
        driver.switch_to.frame(geetest_iframe)
        simulate_human_behavior(driver)

        # Read the token after verification (the real location must be
        # determined from the page).
        # NOTE(review): the token is not forwarded anywhere below --
        # search_enterprise() is called with the keyword only. Confirm the
        # intended hand-off to the crawler.
        token = driver.execute_script("return window.geetest_token")

        # Switch back to the main document.
        driver.switch_to.default_content()

        # Initialise the API crawler and copy the browser cookies over.
        # driver.get_cookies() returns a list of dicts, which cannot be fed
        # to the cookie jar's mapping-style update(); set them one by one.
        crawler = GsxtCrawler()
        for cookie in driver.get_cookies():
            crawler.session.cookies.set(
                cookie['name'],
                cookie['value'],
                domain=cookie.get('domain', ''),
                path=cookie.get('path', '/'),
            )

        # Run the search; ensure_ascii=False keeps Chinese text readable.
        results = crawler.search_enterprise('阿里巴巴')
        print(json.dumps(results, indent=2, ensure_ascii=False))
    finally:
        driver.quit()
数据使用限制:
反爬应对策略:
法律风险提示:
性能提升:
稳定性增强:
功能扩展:
本方案通过结合浏览器自动化与API逆向技术,实现了国家企业信用信息的高效获取。实际开发中需注意:
对于非技术用户,建议通过正规数据服务商(如天眼查、企查查)获取数据,其已解决验证码和反爬问题,并提供更稳定的服务。技术开发者可将本项目作为反反爬虫技术的研究案例,但需严格遵守法律法规。