import json
import re
import requests
from requests import RequestException
from multiprocessing import Pool  # process pool for fetching pages in parallel


def get_page(url):  # fetch one page of the board
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36'}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
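# For reference, the regular expression below expects each entry on the board
# page to roughly follow this markup (a sketch inferred from the pattern's
# capture groups; the live HTML on maoyan.com may differ in detail):
#
#   <dd>
#       <i class="board-index board-index-1">1</i>
#       <a href="/films/..." title="movie title">
#           <img data-src="http://.../poster.jpg">
#       </a>
#       <p class="star">主演：actor1,actor2</p>
#       <p class="releasetime">上映时间：1993-01-01</p>
#       <p class="score"><i class="integer">9.</i><i class="fraction">5</i></p>
#   </dd>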
def parse_one_page(html):  # extract the fields with a regular expression
    pattern = re.compile(
        r'<dd>.*?board-index.*?">(\d+?)</i>.*?title="(.*?)".*?data-src="(.*?)"'
        r'.*?"star">(.*?)</p>.*?"releasetime">(.*?)</p>'
        r'.*?"integer">(.*?)</i>.*?"fraction">(\d+?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'rank': item[0],
            'title': item[1],
            'image': item[2],
            'actors': item[3].strip()[3:],        # drop the '主演：' prefix
            'release_time': item[4].strip()[5:],  # drop the '上映时间：' prefix
            'score': item[5] + item[6],           # integer part + fraction digit
        }


def write_file(content):  # append one record to the output file
    with open('movie.txt', 'a', encoding='utf-8') as f:  # open with UTF-8 encoding
        f.write(json.dumps(content, ensure_ascii=False) + '\n')  # keep Chinese readable instead of \uXXXX escapes


def main(page):
    url = 'http://maoyan.com/board/4'
    html = get_page(url=url + '?offset=' + str(page * 10))
    if html is None:  # request failed or was blocked; skip this page
        return
    for item in parse_one_page(html):
        write_file(item)


if __name__ == '__main__':
    # single process:
    # for i in range(10):
    #     main(i)

    # multiple processes:
    pool = Pool()
    pool.map(main, range(10))
    pool.close()
    pool.join()
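# With ensure_ascii=False, each line of movie.txt holds one human-readable JSON
# record. An illustrative line (values are a sketch, not guaranteed output):
#
#   {"rank": "1", "title": "霸王别姬", "image": "http://.../poster.jpg", "actors": "张国荣,张丰毅,巩俐", "release_time": "1993-01-01", "score": "9.5"}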