"""Crawl iwara.tv search results and download the videos they point to.

The search endpoint is paged.  For every search result the per-video API
record is fetched; when that record carries a ``fileUrl``, its JSON body
lists the actual downloadable sources and the first source's ``view`` URL
is downloaded to the ``videos2`` directory.  The video pages themselves are
dynamic, so scraping HTML with XPath does not work — only the JSON API does.
Crawling stops at the first request/parse error, as in the original script.
"""

# Search endpoint; query=%E5%8E%9F%E7%A5%9E is the URL-encoded query string.
SEARCH_URL = "https://api.iwara.tv/search?type=video&page={}&query=%E5%8E%9F%E7%A5%9E"
# Per-video API record; the id comes from a search result entry.
VIDEO_URL = "https://api.iwara.tv/video/%s"
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'}

# Full set of characters that are illegal in Windows file names
# ('/', '\\', ':', '*', '?', '"', '|', '<', '>'), each mapped to '_'.
# Fixes the original, which never replaced '"', '<', '>' and replaced ':' twice.
_ILLEGAL_CHARS = str.maketrans({ch: '_' for ch in '/\\:*?"|<>'})


def sanitize_filename(name):
    """Return *name* with every filename-illegal character replaced by '_'."""
    return name.translate(_ILLEGAL_CHARS)


def main():
    """Run the paged crawl until the first error aborts it."""
    # Imported inside the function so the pure helper above can be used
    # (and tested) without the third-party requests package installed.
    import requests

    page = 0
    while True:
        page += 1
        print('===正在爬取第{}页数据==='.format(page))
        search_response = requests.get(url=SEARCH_URL.format(page), headers=HEADERS)
        results = search_response.json()['results']  # one dict per video

        for item in results:
            try:
                filename = item['title'] + '.mp4'
                # Only the JSON API record reliably leads to the video source;
                # the src URLs rotate, so they must be resolved per download.
                record = requests.get(url=VIDEO_URL % item['id'], headers=HEADERS).json()
                if 'fileUrl' not in record:
                    continue
                file_url = record['fileUrl']
                if file_url is None:  # JSON null parses to Python None
                    continue
                sources = requests.get(url=file_url, headers=HEADERS).json()
                # NOTE(review): comparison against the literal string 'None'
                # kept from the original — presumably the API returns that
                # string when no files exist; confirm against live responses.
                if sources == 'None':
                    continue
                first_source = sources[0]
                view_url = first_source['src']['view']
            except Exception:
                # Any request/JSON/key failure stops the whole crawl, exactly
                # as the original `except: break` did — but without also
                # swallowing KeyboardInterrupt/SystemExit.
                break

            if view_url:
                # .content yields the raw binary body of the video.
                video_bytes = requests.get(url='https:' + view_url, headers=HEADERS).content
                with open('videos2\\' + sanitize_filename(filename), mode='wb') as f:
                    print('正在保存:', filename)
                    f.write(video_bytes)
            else:
                print("NO VIDEO SOURCE FOUND")
        else:
            # Page finished without error: advance to the next page.
            continue
        # The inner loop broke on an error: end the crawl.
        break


if __name__ == "__main__":
    main()