# -*- coding: utf-8 -*-
"""Scrape the Maoyan Top-100 movie board and dump each entry as JSON lines.

Fetches http://maoyan.com/board/4 page by page (10 entries per page, ten
pages in parallel worker processes), parses each ranking entry out of the
raw HTML with a regular expression, prints every record and appends it to
``result.txt`` as one UTF-8 JSON object per line.
"""
import re
import json
import codecs
from multiprocessing import Pool

# Desktop-browser User-Agent: Maoyan rejects requests without one.
headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'),
}

# One <dd>...</dd> board entry. Groups: 1 rank, 2 poster URL, 3 title,
# 4 star line, 5 release line, 6 integer score part, 7 fraction part.
# Compiled once at import time instead of on every parse_one_page call.
# re.S lets '.' span the newlines inside an entry.
_ENTRY_RE = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>'
    r'.*?data-src="(.*?)"'
    r'.*?name"><a.*?>(.*?)</a>'
    r'.*?star">(.*?)</p>'
    r'.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>'
    r'.*?fraction">(.*?)</i>'
    r'.*?</dd>',
    re.S)


def get_one_page(url):
    """Return the HTML body of *url*, or None on any HTTP/network failure."""
    # Imported lazily so the pure parsing/serialization helpers work even
    # when the third-party `requests` package is not installed.
    import requests
    from requests.exceptions import RequestException
    try:
        # timeout keeps a dead server from hanging a worker forever
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    """Yield one dict per movie entry found in *html*.

    The star/release lines carry fixed Chinese prefixes which are sliced
    off by character count: "主演：" (3 chars) and "上映时间：" (5 chars).
    """
    for item in _ENTRY_RE.findall(html):
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'actor': item[3].strip()[3:],   # drop the 3-char "主演：" prefix
            'time': item[4].strip()[5:],    # drop the 5-char "上映时间：" prefix
            'score': item[5] + item[6],     # e.g. "9." + "5" -> "9.5"
        }


def save_to_file(content):
    """Append *content* as one UTF-8 JSON line to result.txt."""
    with codecs.open('result.txt', 'a', 'utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    """Fetch, parse and persist one board page starting at *offset*."""
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # Network failure or non-200 status: skip this page rather than
        # crash the worker with a TypeError on iterating None.
        return
    for item in parse_one_page(html):
        print(json.dumps(item, ensure_ascii=False))
        save_to_file(item)


if __name__ == '__main__':
    pool = Pool()
    try:
        pool.map(main, [i * 10 for i in range(10)])
    finally:
        pool.close()
        pool.join()