1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
| import time from urllib.parse import unquote import re
# Request headers sent with every crawl request.
# Replace the empty "Cookie" value with one captured from your own logged-in
# browser session (the original note suggests grabbing it with "bp",
# presumably Burp Suite).
header = {
    "Cookie": "",
    "Sec-Ch-Ua": '"Google Chrome";v="107", "Chromium";v="107", "Not=A?Brand";v="24"',
    "Sec-Ch-Ua-Mobile": "?0",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
    "Accept": "application/json, text/plain, */*",
    "Ymg_ssr": "1668755050538_1668842314729_newx3Oy/bZS3JmKFpuSXF0dFO4LitarO41oOAYMKwim4cWMEziXdEuJoQqC9Po8LnWo5xdt5QyC5SUq0hbg09nAW2K1O1NLJfYLrz3r5165KX/7gQEiIR50kz9mBZl08hCunvgRxyRAAwMXTzf25rjN4BpVmunVEUgBmHGR2d5nht+Vzq1QbtBcEwic4HqBWMMGj90dLwILVd0tapplxu4J2lRAgEpW1yLNHPdgmYCA1BS4urb1LmCaUDTC7I8ToSDsexLbmlVuYoOmx+4IlzdZGWV51fl9B7gAktxPdg5qra2UZ9Y57+gJypVJXOtNgJRSL3JjP7XDgYo8bUtTEA6/4vTTYBJLA4CBJ7oXStz8=",
    "X-Requested-With": "XMLHttpRequest",
    "Zx-Open-Url": "https://aiqicha.baidu.com/company_detail_28684316400936",
    "Sec-Ch-Ua-Platform": "Windows",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Dest": "empty",
    "Referer": "https://aiqicha.baidu.com/company_detail_28684316400936",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh,en-US;q=0.9,en;q=0.8,zh-CN;q=0.7",
    "Connection": "close",
}
def input_data(date):
    """Append one record to ``result.txt`` in the current working directory.

    Args:
        date: Text to store. A trailing newline is added after it. (The
            parameter name is kept as-is so existing callers keep working.)
    """
    # Write as UTF-8 explicitly: the platform default encoding is
    # locale-dependent and may be unable to encode non-ASCII scraped text.
    # Plain append mode is enough because the file is never read back here.
    with open("result.txt", mode="a", encoding="utf-8") as fd:
        fd.write(date + "\n")
# first_step: crawl the names of the companies held by the target company and
# append every raw response to result.txt in the current directory. Extract
# the names from that file afterwards (e.g. with a regular-expression search
# in your editor) and save them.
def get_date(start_page=1, end_page=450, delay=0.5, timeout=10):
    """Fetch the holdings pages one by one and store each response body.

    Each page is requested from the ``holdsAjax`` endpoint with the
    module-level ``header`` dict, then its text is passed to ``input_data``.
    The first exception raised stops the crawl; it is printed, not re-raised.

    Args:
        start_page: First page number to request (inclusive).
        end_page: Last page number to request (inclusive).
        delay: Seconds to sleep after each request.
        timeout: Seconds to wait on the server before the request fails.
            Without a timeout a stalled connection would block forever.
    """
    # NOTE(review): `requests` is not among the imports visible in this part
    # of the file -- confirm it is imported at module level.
    url_template = "https://aiqicha.baidu.com/detail/holdsAjax?pid=28684316400936&p={}&size=10&confirm="
    try:
        for page in range(start_page, end_page + 1):
            respond = requests.get(
                url=url_template.format(page),
                headers=header,
                timeout=timeout,
            )
            time.sleep(delay)
            input_data(respond.text)
            print("爬取第{}条完毕,成功入库".format(page))
    except Exception as err:
        # Top-level boundary of the script: report the failure and stop.
        print(err)
# Script entry point: start the crawl only when this file is run directly,
# not when it is imported as a module.
if __name__ == "__main__":
    get_date()
|