# python3
# jiandan meizi tu
"""Download ``.jpg`` images from the paginated comment pages of jandan.net/ooxx.

The script reads the newest comment-page number from the landing page, then
walks backwards through older pages, saving every ``.jpg`` image it finds.
"""
import os
import random
import re
import time
import urllib
import urllib.request as req

# User-Agent header values; one is picked at random for every request.
USER_AGENTS = ('Mozilla/4.0', 'Mozilla/4.1', 'Mozilla/4.5', 'Mozilla/5.1')

# HTTP proxies as "host:port"; one is picked at random per proxied request.
PROXY_ADDRESSES = ('117.135.251.136:82',)

# Marker that precedes the current page number in the landing page HTML.
_PAGE_MARKER = 'current-comment-page'
# Characters skipped from the start of the marker to the first digit: the
# 20-character marker plus three more (presumably `">[` -- verify against the
# live page markup).
_PAGE_NUMBER_OFFSET = 23

# `[^"]` keeps a match inside a single src attribute, so a non-jpg image
# cannot swallow the markup up to the next ".jpg" further down the page.
_IMG_SRC_RE = re.compile(r'img src="(http[^"]*?\.jpg)')


def _build_request(url):
    """Return a Request for *url* carrying a randomly chosen User-Agent."""
    headers = {'User-Agent': random.choice(USER_AGENTS)}
    return urllib.request.Request(url, headers=headers)


def _timestamp():
    """Return the local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())


def url_open(url):
    """Fetch *url* with a random User-Agent and return the raw body bytes.

    Uses ``urllib.request.urlopen``, i.e. whichever opener is currently
    installed globally. Network errors raised by urllib propagate.
    """
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(_build_request(url)) as response:
        return response.read()


def url_open2(url):
    """Fetch *url* through a randomly chosen HTTP proxy and return the bytes.

    The chosen proxy address is printed. Network errors raised by urllib
    propagate to the caller.
    """
    ip = random.choice(PROXY_ADDRESSES)
    print(ip)

    proxy = req.ProxyHandler({'http': ip})
    opener = req.build_opener(proxy, req.HTTPHandler)
    # Kept for backward compatibility: the opener also becomes the global
    # default, so later plain urlopen() calls use the same proxy.
    req.install_opener(opener)
    with opener.open(_build_request(url)) as conn:
        return conn.read()


def parse_current_page(html):
    """Extract the current comment-page number (as a string) from *html*.

    Raises:
        ValueError: if the page marker or the closing ``]`` is missing.
    """
    marker_at = html.find(_PAGE_MARKER)
    if marker_at == -1:
        raise ValueError('page marker %r not found' % _PAGE_MARKER)
    start = marker_at + _PAGE_NUMBER_OFFSET
    end = html.find(']', start)
    if end == -1:
        raise ValueError("closing ']' not found after page marker")
    return html[start:end]


def get_current_page(url):
    """Download *url* and return its current comment-page number as a string.

    Raises:
        ValueError: if the page number cannot be located in the HTML.
    """
    html = url_open2(url).decode('utf-8')
    return parse_current_page(html)


def extract_img_addrs(html):
    """Return every ``http...jpg`` address found in ``img src="..."`` of *html*.

    Addresses are returned in document order; an image whose ``src`` does not
    contain ``.jpg`` is skipped.
    """
    return _IMG_SRC_RE.findall(html)


def find_imgs(url):
    """Download the page at *url* and return the list of its .jpg addresses."""
    html = url_open2(url).decode('utf-8')
    return extract_img_addrs(html)


def save_imgs(folder, img_addrs):
    """Download each address in *img_addrs* and store it inside *folder*.

    The file name is the last path segment of the address. *folder* must
    already exist.
    """
    for each in img_addrs:
        filename = each.split('/')[-1]
        # Download before opening the file so a failed request does not
        # leave an empty file behind.
        img = url_open2(each)
        with open(os.path.join(folder, filename), 'wb') as f:
            f.write(img)


def download_mm(folder='xx', pages=300):
    """Download images from up to *pages* comment pages into *folder*.

    The newest page number is read from the landing page; the walk then
    starts one page below it and moves towards older pages, stopping early
    once page 1 has been processed. *folder* is created if it is missing.
    """
    os.makedirs(folder, exist_ok=True)

    url = 'http://jandan.net/ooxx/'
    current_page_num = int(get_current_page(url))
    for i in range(pages):
        print(_timestamp(), 'current_page_num', current_page_num)
        if i % 3 == 0:
            # Pause on every third page to throttle the requests.
            print(_timestamp(), "sleep 2 seconds...")
            time.sleep(2)
        current_page_num -= 1
        if current_page_num < 1:
            # No page below 1 is requested.
            break
        page_url = url + 'page-' + str(current_page_num) + '#comments'
        img_addrs = find_imgs(page_url)
        save_imgs(folder, img_addrs)


if __name__ == '__main__':
    download_mm()