# 淘女郎

import re,os,random,time,requests
from urllib import request
from bs4 import BeautifulSoup
def h(url):
    """Fetch *url* with a randomly chosen User-Agent and return the body as text.

    One header dict is picked at random from the table below for each call,
    the response body is read completely, and the bytes are decoded as GBK.

    Args:
        url: Address to request.

    Returns:
        The decoded response body as ``str``.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid GBK.
    """
    head = [
        {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)"},
        {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)"},
        {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)"},
        {"User-Agent": "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)"},
        {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)"},
        {"User-Agent": "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)"},
        {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)"},
        {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)"},
        {"User-Agent": "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6"},
        {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1"},
        {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0"},
        {"User-Agent": "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5"},
        {"User-Agent": "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6"},
        {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"},
        {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20"},
        {"User-Agent": "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52"},
        {"User-Agent": "Mozilla/5.0 (Android; Mobile; rv:27.0) Gecko/27.0 Firefox/27.0"},
        {"User-Agent": "BlackBerry9700/5.0.0.862 Profile/MIDP-2.1 Configuration/CLDC-1.1 VendorID/331 UNTRUSTED/1.0 3gpp-gba"},
        {"User-Agent": "Mozilla/5.0 (BlackBerry; U; BlackBerry 9930; en-US) AppleWebKit/534.11+ (KHTML, like Gecko) Version/7.0.0.241 Mobile Safari/534.11+"},
        {"User-Agent": "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; zh-TW) AppleWebKit/534.8+ (KHTML, like Gecko) Version/6.0.0.448 Mobile Safari/534.8+"},
        {"User-Agent": "Mozilla/5.0 (Linux; U; Android 4.0.4; en-gb; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30"},
        {"User-Agent": "Mozilla/5.0 (Linux; U; Android 2.2; en-us; SCH-I800 Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"},
        {"User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"},
        {"User-Agent": "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)"},
        {"User-Agent": "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0; Touch)"},
        {"User-Agent": "Mozilla/5.0(iPad; U; CPU iPhone OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B314 Safari/531.21.10"},
        {"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 5_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9B179 Safari/7534.48.3"},
        {"User-Agent": "Opera/9.80 (J2ME/MIDP; Opera Mini/9.80 (J2ME/22.478; U; en) Presto/2.5.25 Version/10.54"},
        {"User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_3; en-us; Silk/1.1.0-80) AppleWebKit/533.16 (KHTML, like Gecko) Version/5.0 Safari/533.16 Silk-Accelerated=true"},
        {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.13+ (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2"},
        {"User-Agent": "Mozilla/5.0 (Linux; Android 4.4.4; Nexus 5 Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.117 Mobile Safari/537.36"},
        {"User-Agent": "Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166  Safari/535.19"},
        {"User-Agent": "Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13"},
        {"User-Agent": "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/12.0.024; Profile/MIDP-2.1 Configuration/CLDC-1.1; en-us) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.12344"},
        {"User-Agent": "Mozilla/5.0 (X11; U; Linux armv7l; no-NO; rv:1.9.2.3pre) Gecko/20100723 Firefox/3.5 Maemo Browser 1.7.4.8 RX-51 N900"},
        {"User-Agent": "Mozilla/5.0 (PlayBook; U; RIM Tablet OS 2.0.1; en-US) AppleWebKit/535.8+ (KHTML, like Gecko) Version/7.2.0.1 Safari/535.8+"},
        {"User-Agent": "Mozilla/5.0 (PLAYSTATION 3 4.60) AppleWebKit/531.22.8 (KHTML, like Gecko)"},
        {"User-Agent": "Mozilla/5.0 (PlayStation Vita 3.12) AppleWebKit/536.26 (KHTML, like Gecko) Silk/3.2"},
        {"User-Agent": "Mozilla/5.0 (Linux; Android 4.4.2; en-us; SAMSUNG SCH-I545 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Version/1.5 Chrome/28.0.1500.94 Mobile Safari/537.36"},
        {"User-Agent": "Mozilla/5.0 (Linux; Android 4.1.2; GT-I8190 Build/JZO54K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.117 Mobile Safari/537.36"},
        {"User-Agent": "Mozilla/5.0 (Linux; Android 4.4.2; en-gb; SAMSUNG SM-G900F Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Version/1.6 Chrome/28.0.1500.94 Mobile Safari/537.36"},
        {"User-Agent": "Mozilla/5.0 (SAMSUNG; SAMSUNG-GT-S8530/S8530DDLC2; U; Bada/2.0; en-us) AppleWebKit/534.20 (KHTML, like Gecko) Dolfin/3.0 Mobile WVGA SMM-MMS/1.2.0 OPN-B"},
        {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:15.0) Gecko/20100101 Firefox/15.0.1"},
        {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"},
        {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; SAMSUNG; SGH-i917)"},
        {"User-Agent": "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; ARM; Trident/6.0)"},
        {"User-Agent": "Mozilla/5.0 (iPhone; U; CPU iPhone OS) (compatible; Googlebot-Mobile/2.1; http://www.google.com/bot.html)"},
        {"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"},
        {"User-Agent": "DoCoMo/2.0 N905i(c100;TB;W24H16) (compatible; Googlebot-Mobile/2.1; +http://www.google.com/bot.html)"},
        {"User-Agent": "SAMSUNG-SGH-I617/UCHJ1 Mozilla/4.0 (compatible; MSIE 6.0; Windows CE; IEMobile 7.11)"},
        {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows CE; IEMobile 8.12; MSIEMobile 6.0) 320x240; VZW; UTStar-XV6175.1; Windows Mobile 6.5 Standard;"},
        {"User-Agent": "Opera/9.80 (Android 2.3.3; Linux; Opera Mobi/ADR-1202011015; U; en) Presto/2.9.201 Version/11.50"},
        {"User-Agent": "Opera/9.80 (BREW; Opera Mini/5.0/27.2370; U; en) Presto/2.8.119 240X320 Samsung SCH-U380"},
        {"User-Agent": "Mozilla/5.0 (Windows; U; Win 9x 4.90; en-GB; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1"},
        {"User-Agent": "Mozilla/5.0 (X11; U; SunOS sun4u; en-US; rv:1.6) Gecko/20040503"},
    ]
    headers = random.choice(head)
    req = request.Request(url=url, headers=headers)
    # Close the response deterministically instead of leaking the connection
    # until garbage collection.
    with request.urlopen(req) as res:
        data = res.read()
    return data.decode('gbk')
def urlall():
    """Collect the user ids linked from the ranking list page(s).

    Each list page is fetched with ``h`` and parsed with BeautifulSoup; for
    every ``<p class="top">`` element the text after the first ``=`` in its
    link's ``href`` is taken as the user id.

    Returns:
        A ``set`` of user-id strings.
    """
    base_url = 'https://mm.taobao.com/json/request_top_list.htm?page='
    user_ids = set()
    for page in range(1, 2):
        # Build the page URL from the fixed base every time; appending to the
        # same variable would turn page 2 into "...page=12" and so on.
        soup = BeautifulSoup(h(base_url + str(page)), 'lxml')
        for entry in soup.find_all("p", "top"):
            href = entry.a["href"]
            # str.find returns -1 when "=" is absent, in which case the whole
            # href is kept (same as the original slicing).
            user_ids.add(href[href.find('=') + 1:])
    return user_ids
def _download_album_photos(user_id, album_id, album_folder, pic_num):
    """Download every photo of one album into *album_folder*.

    Files are written as ``1.jpg``, ``2.jpg``, ... and the numbering runs
    continuously across all pages of the album's photo list.
    """
    # The code assumes the photo-list endpoint serves 16 photos per page;
    # round up so an exact multiple of 16 does not request an extra page.
    total_page = (int(pic_num) + 15) // 16
    # Start the counter once per album: resetting it per page would make
    # later pages overwrite the files saved from earlier pages.
    pic_count = 1
    for page in range(1, total_page + 1):
        url = 'https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id=%s&album_id=%s&page=%s' % (user_id, album_id, page)
        payload = requests.get(url).json()
        for pic in payload['picList']:
            print ('现在开始下载该相册的第%d张照片……' % pic_count)
            pic_url = 'https:' + pic['picUrl']
            # Every "290" in the URL is replaced by "620" -- presumably a
            # thumbnail-size token swapped for a larger one; TODO confirm.
            pic_url = re.sub(r'290', '620', pic_url)
            filename = '%s//%s.jpg' % (album_folder, pic_count)
            with open(filename, 'wb') as f:
                f.write(requests.get(pic_url).content)
            print ('下载完毕!')
            pic_count += 1


def jbxx(user_id):
    """Print one model's profile and download all of her photo albums.

    The profile page is fetched and parsed into a list of "label:value"
    strings, which is printed (it is not written to disk). A folder named
    after the first profile entry is created under ``f://淘女郎``; every album
    gets a sub-folder there. Albums whose folder already exists are skipped,
    so an interrupted run can be resumed.

    Args:
        user_id: Model's user id, as returned by ``urlall``.

    Returns:
        None.
    """
    url = 'https://mm.taobao.com/self/info/model_info_show.htm?user_id=' + str(user_id)
    data = h(url)
    bs2 = BeautifulSoup(data, 'lxml')
    base_info = bs2.find('ul', 'mm-p-info-cell clearfix')
    info_list = base_info('span')
    result = [
        '昵称:' + info_list[0].text,
        '生日:' + info_list[1].text.strip(),
        '所在城市:' + info_list[2].text,
        '职业:' + info_list[3].text,
        '血型:' + info_list[4].text,
        '学校/专业:' + info_list[5].text.strip(),
        '风格:' + info_list[6].text,
    ]
    # The body measurements live in dedicated <li> cells rather than <span>s.
    for label, css_class in (
        ('身高:', 'mm-p-small-cell mm-p-height'),
        ('体重:', 'mm-p-small-cell mm-p-weight'),
        ('三围:', 'mm-p-small-cell mm-p-size'),
        ('罩杯:', 'mm-p-small-cell mm-p-bar'),
        ('鞋码:', 'mm-p-small-cell mm-p-shose'),
    ):
        result.append(label + base_info.find('li', css_class).find('p').text)
    print ('资料收集完毕,正在保存她的个人资料……')
    # NOTE(review): the folder name includes the "昵称:" label and its colon,
    # which looks invalid in a Windows path component -- confirm on target OS.
    name = 'f://淘女郎//%s' % result[0]
    # makedirs also creates the parent folder, which plain mkdir required to
    # exist already.
    os.makedirs(name, exist_ok=True)
    print(result)
    print ('保存完毕!')

    # NOTE(review): this page is requested with _charset=utf-8 but h() decodes
    # as GBK; only the page-count input is read from it -- confirm.
    imgurl = 'https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id%20=' + str(user_id)
    imgdata = h(imgurl)
    bs3 = BeautifulSoup(imgdata, 'lxml')
    album_total_page = int(bs3.find('input', id='J_Totalpage')['value'])

    # Running album number for the progress message; it must advance for every
    # album instead of staying at 1.
    album_count = 0
    for album_page in range(1, album_total_page + 1):
        url = 'https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id%%20=%s&page=%d' % (user_id, album_page)
        bs3 = BeautifulSoup(requests.get(url).text, 'lxml')
        for album_area in bs3('div', 'mm-photo-cell-middle'):
            album_count += 1
            # Extract the album's link, id, name and photo count.
            album_link = album_area.find('h4').find('a')
            album_url = 'https:' + album_link['href']
            album_id = re.search(r'album_id=(\d+)', album_url).group(1)
            album_name = album_link.text.strip()
            pic_num = album_area.find('span', 'mm-pic-number').text
            pic_num = re.search(r'\d+', pic_num).group(0)
            print ('现在开始爬取她的第%d个相册,相册名为:《%s》(%s张)……' % (album_count, album_name, pic_num))

            # One folder per album, named after the album.
            album_folder = '%s//%s' % (name, album_name)
            if os.path.isdir(album_folder):
                # Folder already present: treat the album as downloaded.
                continue
            os.mkdir(album_folder)
            _download_album_photos(user_id, album_id, album_folder, pic_num)
# Script entry: crawl every model whose id appears on the ranking list.
# The loop body is indented again; as pasted it raised IndentationError.
for user_id in urlall():
    jbxx(user_id)

# 你可能感兴趣的:(淘女郎)