爬取(明星网)明星面部数据

爬取(明星网)明星面部数据

from bs4 import BeautifulSoup
import os
import requests
import time

1 下载数据

1.1 请求分析

  • Request
GET /upload/thumb/2015/11-16/0-uwo1Wk.jpg HTTP/1.1
Host: img.mingxing.com
Referer: http://img.mingxing.com//mingxing//20181015/88aa35c304dc06e822bb2efdd33497a5.jpg
User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6821.400 QQBrowser/10.3.3040.400
def get_img(url, path):
    """Download one image and save it to a local file.

    The request carries a desktop-browser ``User-Agent`` and uses the image's
    own URL as the ``Referer`` header.
    NOTE(review): the Referer is presumably there to satisfy the image host's
    hotlink check -- confirm.

    Args:
        url: Absolute URL of the image to fetch.
        path: Destination file path. Missing parent directories are created.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6821.400 QQBrowser/10.3.3040.400",
        "Referer": url,
    }

    # Without an explicit timeout a stalled connection would block forever.
    response = requests.get(url, headers=headers, timeout=30)
    # Do not store an HTTP error page under an image file name.
    response.raise_for_status()

    # open() does not create directories, so make sure the target one exists.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(path, "wb") as fw:
        fw.write(response.content)
        
if __name__ == "__main__":
    # Smoke test: download a single sample image into the dataset folder.
    get_img(
        url="http://img.mingxing.com//mingxing//20181015/88aa35c304dc06e822bb2efdd33497a5.jpg",
        path="./dataset/tmp.jpg",
    )

2 明星列表页面

  • Request
GET /ziliao/index?&p=1 HTTP/1.1
Host: www.mingxing.com
Connection: keep-alive
User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6821.400 QQBrowser/10.3.3040.400
Upgrade-Insecure-Requests: 1
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9
Cookie: __51cke__=; UM_distinctid=169c7147623b32-041b6f54ccddbe-3257487f-232800-169c7147624ddd; CNZZDATA30054349=cnzz_eid%3D515205138-1553821270-https%253A%252F%252Fwww.baidu.com%252F%26ntime%3D1553821270; PHPSESSID=9btdnv30htpj54ies9em19pan1; right_adv=%7B%22time%22%3A%222019329%22%2C%22number%22%3A20%7D; __tins__18838395=%7B%22sid%22%3A%201553825001869%2C%20%22vd%22%3A%203%2C%20%22expires%22%3A%201553827085186%7D; __51laig__=21

2.1 单页明星列表

# Celebrity index (list) page; the page number is sent as the "p" query parameter.
URL_MINGXING_CELEBRITY_LIST = "http://www.mingxing.com/ziliao/index"


def get_celebrities_one_page(url, idx_page):
    """Scrape one page of the celebrity index.

    Args:
        url: URL of the index page (e.g. ``URL_MINGXING_CELEBRITY_LIST``).
        idx_page: Page number, sent as the ``p`` query parameter.

    Returns:
        A list of dicts, one per celebrity thumbnail found inside
        ``div.page_starlist``, with the keys:

        * ``"name"``: stripped ``alt`` text of the thumbnail,
        * ``"url"``: absolute URL of the celebrity's own page,
        * ``"img_urls"``: list holding the thumbnail URL (empty when the
          ``<img>`` has no ``src``).

        Thumbnails without a name or without an enclosing link are skipped.
        The list is empty when the page has no ``div.page_starlist``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Function-scope import: nothing else in the file needs urljoin.
    from urllib.parse import urljoin

    # Headers copied from a captured browser request.
    # NOTE(review): the Cookie embeds a browser session id that is probably
    # stale by now -- confirm whether the site needs it at all.
    headers = {
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6821.400 QQBrowser/10.3.3040.400",
        "Upgrade-Insecure-Requests": "1",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cookie": "__51cke__=; UM_distinctid=169c7147623b32-041b6f54ccddbe-3257487f-232800-169c7147624ddd; CNZZDATA30054349=cnzz_eid%3D515205138-1553821270-https%253A%252F%252Fwww.baidu.com%252F%26ntime%3D1553821270; PHPSESSID=9btdnv30htpj54ies9em19pan1; right_adv=%7B%22time%22%3A%222019329%22%2C%22number%22%3A20%7D; __tins__18838395=%7B%22sid%22%3A%201553825001869%2C%20%22vd%22%3A%203%2C%20%22expires%22%3A%201553827085186%7D; __51laig__=21",
    }

    # Without an explicit timeout a stalled connection would block forever.
    response = requests.get(url, params={"p": idx_page}, headers=headers, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'lxml')

    # find() returns None when the container is absent; report "no entries"
    # instead of failing with an AttributeError.
    container = soup.find("div", class_="page_starlist")
    if container is None:
        return []

    lst_celebrities = []
    for item in container.find_all("img"):
        link = item.find_parent("a")
        name = (item.get("alt") or "").strip()
        href = link.get("href") if link is not None else None
        if not name or not href:
            # Without a name and a target page the entry cannot be used.
            continue

        src = item.get("src")
        lst_celebrities.append({
            "name": name,
            # urljoin copes with both site-relative and already-absolute hrefs.
            "url": urljoin("http://www.mingxing.com", href),
            "img_urls": [src] if src else [],
        })

    return lst_celebrities

if __name__ == "__main__":
    # Demo: list the celebrities found on the first index page.
    print(get_celebrities_one_page(URL_MINGXING_CELEBRITY_LIST, idx_page=1))

2.2 多页明星列表

# Number of index pages to crawl.
NUM_PAGES = 10


def get_celebrities(url, num_pages):
    """Collect the celebrities listed on the first ``num_pages`` index pages.

    Args:
        url: URL of the celebrity index page.
        num_pages: How many pages to fetch, starting at page 1. Values below
            1 fetch nothing.

    Returns:
        The concatenated results of ``get_celebrities_one_page`` for pages
        ``1 .. num_pages`` (inclusive), in page order.
    """
    lst_celebrities = []
    # Pages are numbered from 1, so the last page to fetch is num_pages
    # itself; range(1, num_pages) would silently drop it.
    for idx_page in range(1, num_pages + 1):
        lst_celebrities.extend(get_celebrities_one_page(url, idx_page))
        # Pause between consecutive page requests.
        time.sleep(3)
    return lst_celebrities

if __name__ == "__main__":
    # Demo: crawl NUM_PAGES index pages and show the collected entries.
    print(get_celebrities(URL_MINGXING_CELEBRITY_LIST, NUM_PAGES))

[{'name': '鹿晗', 'url': 'http://www.mingxing.com/mingxing/index/name/luhan.html', 'img_urls': ['http://img.mingxing.com/upload/thumb/6/17097.jpg']}, {'name': '迪丽热巴', 'url': 'http://www.mingxing.com/mingxing/index/name/dilireba.html', 'img_urls':

['http://img.mingxing.com/upload/thumb/5/14274.jpg']}, {'name': '王艺洁', 'url': 'http://www.mingxing.com/mingxing/index/name/wangyijie.html', 'img_urls': ['http://img.mingxing.com/upload/thumb/5/14276.jpg']}, {'name': '段林希', 'url': 'http://www.mingxing.com/mingxing/index/name/duanlinxi.html', 'img_urls': ['http://img.mingxing.com/upload/thumb/5/14277.jpg']}]

3 明星页面

GET /mingxing/index/name/luhan.html HTTP/1.1
Host: www.mingxing.com
Connection: keep-alive
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6821.400 QQBrowser/10.3.3040.400
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
Referer: http://www.mingxing.com/ziliao/index?&p=1
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9
Cookie: __51cke__=; UM_distinctid=169c7147623b32-041b6f54ccddbe-3257487f-232800-169c7147624ddd; PHPSESSID=9btdnv30htpj54ies9em19pan1; right_adv=%7B%22time%22%3A%222019329%22%2C%22number%22%3A29%7D; __tins__18838395=%7B%22sid%22%3A%201553844269026%2C%20%22vd%22%3A%201%2C%20%22expires%22%3A%201553846069026%7D; __51laig__=30; CNZZDATA30054349=cnzz_eid%3D515205138-1553821270-https%253A%252F%252Fwww.baidu.com%252F%26ntime%3D1553843231
def get_celebrity_img_urls(url):
    """Return the photo URLs shown on one celebrity's page.

    Args:
        url: Absolute URL of the celebrity page.

    Returns:
        The ``src`` value of every ``<img>`` inside ``ul.page_starphoto``, in
        document order. Images without a ``src`` are skipped. The list is
        empty when the page has no ``ul.page_starphoto``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Headers copied from a captured browser request.
    # NOTE(review): the Cookie embeds a browser session id that is probably
    # stale by now -- confirm whether the site needs it at all.
    headers = {
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6821.400 QQBrowser/10.3.3040.400",
        "Upgrade-Insecure-Requests": "1",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cookie": "__51cke__=; UM_distinctid=169c7147623b32-041b6f54ccddbe-3257487f-232800-169c7147624ddd; CNZZDATA30054349=cnzz_eid%3D515205138-1553821270-https%253A%252F%252Fwww.baidu.com%252F%26ntime%3D1553821270; PHPSESSID=9btdnv30htpj54ies9em19pan1; right_adv=%7B%22time%22%3A%222019329%22%2C%22number%22%3A20%7D; __tins__18838395=%7B%22sid%22%3A%201553825001869%2C%20%22vd%22%3A%203%2C%20%22expires%22%3A%201553827085186%7D; __51laig__=21",
    }

    # Without an explicit timeout a stalled connection would block forever.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'lxml')

    # find() returns None when the gallery is absent; report "no photos"
    # instead of failing with an AttributeError.
    gallery = soup.find("ul", class_="page_starphoto")
    if gallery is None:
        return []

    # img.get() avoids a KeyError on <img> tags that carry no src attribute.
    return [img["src"] for img in gallery.find_all("img") if img.get("src")]

if __name__ == "__main__":
    # Demo: show the photo URLs found on one celebrity page. The result is
    # printed (as the other demos do) rather than discarded, so running the
    # snippet actually shows something.
    print(get_celebrity_img_urls("http://www.mingxing.com/mingxing/index/name/luhan.html"))

4 创建明星面部数据集

if __name__ == "__main__":
    # Build the dataset: one sub-directory per celebrity under DATASET_PATH,
    # holding that celebrity's images numbered 0001.jpg, 0002.jpg, ...
    # NOTE(review): every file gets a ".jpg" suffix, whatever the extension
    # of the source URL -- confirm that this is intended.

    NUM_PAGES = 10
    DATASET_PATH = "./dataset"

    # Celebrity list gathered from the index pages.
    lst_celebrities = get_celebrities(URL_MINGXING_CELEBRITY_LIST, NUM_PAGES)

    for celebrity in lst_celebrities:

        # Per-celebrity output directory.
        celebrity_dir = os.path.join(DATASET_PATH, celebrity["name"])
        print("*" * 10)
        print("celebrity: {}".format(celebrity["name"]))

        os.makedirs(celebrity_dir, exist_ok=True)

        # Add the photos from the celebrity's own page to the list thumbnail.
        # A failing page must not abort the whole crawl; the thumbnail that
        # is already in img_urls is still downloaded.
        try:
            celebrity["img_urls"].extend(get_celebrity_img_urls(celebrity["url"]))
        except requests.RequestException as err:
            print("failed to list images for {}: {}".format(celebrity["url"], err))

        for idx_img, img_url in enumerate(celebrity["img_urls"], start=1):
            img_path = os.path.join(celebrity_dir, "{:04d}.jpg".format(idx_img))
            try:
                get_img(img_url, img_path)
            except requests.RequestException as err:
                # Skip this image; its number stays unused so the remaining
                # files keep the position they have in img_urls.
                print("failed {} ---> {}: {}".format(img_url, img_path, err))
            else:
                print("download {} ---> {}".format(img_url, img_path))
            # Pause between image requests, including after a failure.
            time.sleep(3)

**********
celebrity: 鹿晗
download http://img.mingxing.com/upload/thumb/6/17097.jpg ---> ./dataset\鹿晗\0001.jpg
download http://img.mingxing.com/mingxing//20180928/2e8dc41ba5f72d2e0ed005541a515a54.jpg ---> ./dataset\鹿晗\0002.jpg
download http://img.mingxing.com/mingxing//20180319/c84bf559d0dd0e2fae005f84a4016f6c.jpg ---> ./dataset\鹿晗\0003.jpg
download http://img.mingxing.com/upload/thumb/2017/06-28/0-5QoNse.jpg ---> ./dataset\鹿晗\0004.jpg
download http://img.mingxing.com/upload/thumb/2017/05-02/0-gOgsmr.jpg ---> ./dataset\鹿晗\0005.jpg
download http://img.mingxing.com/upload/thumb/2016/12-20/0-2GSIij.jpg ---> ./dataset\鹿晗\0006.jpg
download http://img.mingxing.com/upload/thumb/2016/07-07/0-yaTLqz.jpg ---> ./dataset\鹿晗\0007.jpg
download http://img.mingxing.com/upload/thumb/2016/04-21/0-iDv7Fj.jpg ---> ./dataset\鹿晗\0008.jpg
download http://img.mingxing.com/upload/thumb/2016/04-11/0-FTCO8H.jpg ---> ./dataset\鹿晗\0009.jpg
download http://img.mingxing.com/upload/thumb/2016/03-21/0-op5Sbt.jpg ---> ./dataset\鹿晗\0010.jpg
download http://img.mingxing.com/upload/thumb/2015/12-30/0-08NWI0.jpg ---> ./dataset\鹿晗\0011.jpg
download http://img.mingxing.com/upload/thumb/2015/12-08/0-wlgnGF.jpg ---> ./dataset\鹿晗\0012.jpg
download http://img.mingxing.com/upload/thumb/2015/11-16/0-uwo1Wk.jpg ---> ./dataset\鹿晗\0013.jpg
**********
celebrity: 迪丽热巴
download http://img.mingxing.com/content/20180103/535f03beaa9b7f0cb3c6f2f302886bf8.jpg ---> ./dataset\迪丽热巴\0001.jpg
download http://img.mingxing.com/mingxing//20181015/14b77dfea0cad1360955d818fcbb0de6.jpg ---> ./dataset\迪丽热巴\0002.jpg
download http://img.mingxing.com/mingxing//20180921/28e35a28498d760e908abce74fd40f5f.jpg ---> ./dataset\迪丽热巴\0003.jpg
download http://img.mingxing.com/mingxing//20180726/17702f5a9b8b998cbb0c70c260b40ad3.gif ---> ./dataset\迪丽热巴\0004.jpg
download http://img.mingxing.com/mingxing//20180620/ea20b15f13f6b34d1b4764553bfba7a9.png ---> ./dataset\迪丽热巴\0005.jpg
download http://img.mingxing.com/mingxing//20180417/985a84ccae9646f31f4dd717ccd40508.jpg ---> ./dataset\迪丽热巴\0006.jpg
download http://img.mingxing.com/mingxing//20180411/5376e604692d6fb42ae7a48e73143eb8.jpg ---> ./dataset\迪丽热巴\0007.jpg
download http://img.mingxing.com/mingxing/20180301/bdd3cbbf262d7793f21ed10975744c22.jpg ---> ./dataset\迪丽热巴\0008.jpg
download http://img.mingxing.com/mingxing/20180301/3418a7189704f4e68f81a29b4320af87.jpg ---> ./dataset\迪丽热巴\0009.jpg
download http://img.mingxing.com/mingxing/20180227/d6aa477ed34271c06fe9edb4dccc9e94.jpg ---> ./dataset\迪丽热巴\0010.jpg
download http://img.mingxing.com/mingxing/20180227/92dccee3c3ab96b8aae57f2f0469b1c2.jpg ---> ./dataset\迪丽热巴\0011.jpg
download http://img.mingxing.com/mingxing/20180226/0fc7ff656cabc975cbb349daeb6ee793.jpg ---> ./dataset\迪丽热巴\0012.jpg
download http://img.mingxing.com/mingxing/20180225/45a68453086b2307eaf10b7921b7e199.jpg ---> ./dataset\迪丽热巴\0013.jpg

...

celebrity: 约翰尼·德普
download http://img.mingxing.com/upload/thumb/5/14261.jpg ---> ./dataset\约翰尼·德普\0001.jpg
download http://img.mingxing.com/upload/thumb/2016/05-24/0-re7Tem.jpg ---> ./dataset\约翰尼·德普\0002.jpg
download http://img.mingxing.com/upload/thumb/2016/04-13/0-X6RYXs.jpg ---> ./dataset\约翰尼·德普\0003.jpg
download http://img.mingxing.com/upload/thumb/2016/03-25/0-bxK5os.jpg ---> ./dataset\约翰尼·德普\0004.jpg
download http://img.mingxing.com/upload/thumb/2016/03-25/0-h77lr9.jpg ---> ./dataset\约翰尼·德普\0005.jpg
download http://img.mingxing.com/upload/thumb/2016/03-17/0-U3Y3EK.jpg ---> ./dataset\约翰尼·德普\0006.jpg
download http://img.mingxing.com/upload/thumb/2016/03-17/0-WUdojP.jpg ---> ./dataset\约翰尼·德普\0007.jpg
download http://img.mingxing.com/upload/thumb/2016/03-17/0-ghntJ4.jpg ---> ./dataset\约翰尼·德普\0008.jpg
download http://img.mingxing.com/upload/thumb/2016/02-26/0-G2Th8a.jpg ---> ./dataset\约翰尼·德普\0009.jpg
download http://img.mingxing.com/upload/thumb/2016/02-23/0-cARUg7.jpg ---> ./dataset\约翰尼·德普\0010.jpg
download http://img.mingxing.com/upload/thumb/2016/02-18/0-DLYZNo.jpg ---> ./dataset\约翰尼·德普\0011.jpg
download http://img.mingxing.com/upload/thumb/2016/01-29/0-Pe5YMh.jpg ---> ./dataset\约翰尼·德普\0012.jpg
**********
celebrity: 雨果·维文
download http://img.mingxing.com/upload/thumb/5/14262.jpg ---> ./dataset\雨果·维文\0001.jpg
download http://img.mingxing.com/upload/thumb/2016/04-13/0-Pm9m6p.jpg ---> ./dataset\雨果·维文\0002.jpg
download http://img.mingxing.com/upload/thumb/2016/04-13/0-kqjoN7.jpg ---> ./dataset\雨果·维文\0003.jpg
download http://img.mingxing.com/upload/thumb/2016/04-08/0-03NXtB.jpg ---> ./dataset\雨果·维文\0004.jpg
download http://img.mingxing.com/upload/thumb/2016/03-30/0-TJRqeD.jpg ---> ./dataset\雨果·维文\0005.jpg
download http://img.mingxing.com/upload/thumb/2016/02-26/0-Wuurq1.jpg ---> ./dataset\雨果·维文\0006.jpg
download http://img.mingxing.com/upload/thumb/2016/02-18/0-4fqOgM.jpg ---> ./dataset\雨果·维文\0007.jpg
**********
celebrity: 希亚·拉博夫

你可能感兴趣的:(爬取(明星网)明星面部数据)