"""Scrape the Maoyan movie board with requests and regular expressions.

Each board page is fetched over HTTP, parsed with one regular expression,
and every parsed entry is appended to ``mmovie.txt`` as a JSON object on
its own line.
"""
import json
import re
from multiprocessing import Pool

BOARD_URL = 'http://maoyan.com/board/4?offset='
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36'
)
OUTPUT_FILE = 'mmovie.txt'

# One match per <dd>...</dd> entry. Capture groups, in order: board index,
# image URL, name, star text, release-time text, integer part of the score,
# fraction part of the score. Adjacent raw-string literals are concatenated
# by the parser, so no "+" is needed between the pieces.
_ENTRY_PATTERN = re.compile(
    r'<dd.*?board-index.*?>(\d+)</i>'
    r'.*?data-src="(.*?)".*?</a>'
    r'.*?<a.*?data-val.*?>(.*?)</a>'
    r'.*?star.*?>(.*?)</p>'
    r'.*?releasetime.*?>(.*?)</p>'
    r'.*?integer.*?>(.*?)</i>'
    r'.*?fraction.*?>(\d+)</i>'
    r'.*?</dd>',
    re.S,
)


def getOneContent(url, headers, timeout=10):
    """Fetch one page and return its text.

    :param url: address of the page to download
    :param headers: HTTP headers sent with the request
    :param timeout: seconds to wait for the server before giving up
    :return: the response body as text, or ``None`` when the request fails
        or the status code is not 200
    """
    # Imported here so the parsing helpers below stay importable (and
    # testable) on machines where the HTTP library is not installed.
    import requests
    from requests.exceptions import RequestException

    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def parserContent(content):
    """Extract the board entries from a page with the entry regex.

    :param content: page HTML, or ``None``/empty when the download failed
    :return: list of 7-tuples of captured strings (see ``_ENTRY_PATTERN``);
        an empty list when there is no content or nothing matches
    """
    if not content:
        # Return an empty list so callers can always iterate the result.
        return []
    return _ENTRY_PATTERN.findall(content)


def processData(results):
    """Turn the raw regex tuples into dictionaries, one per entry.

    :param results: iterable of 7-tuples from ``parserContent`` (``None``
        is treated as empty)
    :return: generator of dicts with the keys ``index``, ``imgurl``,
        ``name``, ``star``, ``releasetime`` and ``score``
    """
    for result in results or ():
        yield {
            'index': result[0],
            'imgurl': result[1],
            'name': result[2],
            # Drop the 3-character label in front of the star list.
            'star': result[3].strip()[3:],
            # Drop the 5-character label in front of the release time.
            'releasetime': result[4].strip()[5:],
            # Integer and fraction parts are captured separately.
            'score': result[5] + result[6],
        }


def storeData(data):
    """Append one entry to the output file as a single JSON line.

    ``ensure_ascii=False`` keeps non-ASCII text readable in the file
    instead of writing ``\\uXXXX`` escapes.

    :param data: JSON-serialisable object to write
    :return: None
    """
    with open(OUTPUT_FILE, 'a', encoding='utf-8') as f:
        f.write(json.dumps(data, ensure_ascii=False) + '\n')


def main(offset):
    """Download, parse and store the board page at the given offset.

    :param offset: value of the ``offset`` query parameter of the board URL
    :return: None
    """
    url = BOARD_URL + str(offset)
    headers = {'User-Agent': USER_AGENT}
    html = getOneContent(url, headers=headers)
    for item in processData(parserContent(html)):
        storeData(item)


if __name__ == '__main__':
    # One task per page offset (0, 10, ..., 90); the context manager
    # releases the worker processes once map() has returned.
    with Pool() as pool:
        pool.map(main, [i * 10 for i in range(10)])

# Attribution footer from the original page (not part of the program):
# first published by the Itcast AI + Python training academy.
# Author: http://战神博伊卡图片.tianfu2024.sbs/