from bs4 import BeautifulSoup
import os
import requests
import time
GET /upload/thumb/2015/11-16/0-uwo1Wk.jpg HTTP/1.1
Host: img.mingxing.com
Referer:http://img.mingxing.com//mingxing//20181015/88aa35c304dc06e822bb2efdd33497a5.jpg
User-Agent:Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6821.400 QQBrowser/10.3.3040.400
def get_img(url, path, timeout=10):
    """Download the image at ``url`` and write its bytes to ``path``.

    Args:
        url: Absolute URL of the image. The same value is sent as the
            ``Referer`` header.
        path: Destination file path. Missing parent directories are created.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx response (``requests.HTTPError``).
        OSError: If the destination cannot be created or written.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6821.400 QQBrowser/10.3.3040.400",
        "Referer": url,
    }
    # Without a timeout a stalled server would hang the crawl forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly instead of saving an HTML error page under an image name.
    response.raise_for_status()

    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, "wb") as fw:
        fw.write(response.content)
if __name__ == "__main__":
    # Smoke test: download one known image into the dataset folder.
    sample_url = "http://img.mingxing.com//mingxing//20181015/88aa35c304dc06e822bb2efdd33497a5.jpg"
    sample_path = "./dataset/tmp.jpg"
    get_img(sample_url, sample_path)
GET /ziliao/index?&p=1 HTTP/1.1
Host: www.mingxing.com
Connection: keep-alive
User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6821.400 QQBrowser/10.3.3040.400
Upgrade-Insecure-Requests: 1
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9
Cookie: __51cke__=; UM_distinctid=169c7147623b32-041b6f54ccddbe-3257487f-232800-169c7147624ddd; CNZZDATA30054349=cnzz_eid%3D515205138-1553821270-https%253A%252F%252Fwww.baidu.com%252F%26ntime%3D1553821270; PHPSESSID=9btdnv30htpj54ies9em19pan1; right_adv=%7B%22time%22%3A%222019329%22%2C%22number%22%3A20%7D; __tins__18838395=%7B%22sid%22%3A%201553825001869%2C%20%22vd%22%3A%203%2C%20%22expires%22%3A%201553827085186%7D; __51laig__=21
# URL of the paginated celebrity index; the page number is sent separately
# as the "p" query parameter.
URL_MINGXING_CELEBRITY_LIST = "http://www.mingxing.com/ziliao/index"
<div>:class="page_starlist",明星列表
->
-->-
---><a>:href - 明星页面url
---->
-----><img>:src - 明星图片url,alt - 明星姓名
def get_celebrities_one_page(url, idx_page, timeout=10):
    """Scrape one page of the celebrity index.

    Args:
        url: URL of the celebrity list page.
        idx_page: Page number, sent as the ``p`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts, one per celebrity found, with the keys ``"name"``
        (stripped ``alt`` text of the thumbnail), ``"url"`` (absolute URL of
        the celebrity page) and ``"img_urls"`` (list holding the thumbnail
        ``src``; empty when the tag has no ``src``). Returns an empty list
        when the page has no ``div.page_starlist`` container.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx response.
    """
    headers = {
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6821.400 QQBrowser/10.3.3040.400",
        "Upgrade-Insecure-Requests": "1",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cookie": "__51cke__=; UM_distinctid=169c7147623b32-041b6f54ccddbe-3257487f-232800-169c7147624ddd; CNZZDATA30054349=cnzz_eid%3D515205138-1553821270-https%253A%252F%252Fwww.baidu.com%252F%26ntime%3D1553821270; PHPSESSID=9btdnv30htpj54ies9em19pan1; right_adv=%7B%22time%22%3A%222019329%22%2C%22number%22%3A20%7D; __tins__18838395=%7B%22sid%22%3A%201553825001869%2C%20%22vd%22%3A%203%2C%20%22expires%22%3A%201553827085186%7D; __51laig__=21",
    }
    params = {"p": idx_page}
    response = requests.get(url, params=params, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    container = soup.find("div", class_="page_starlist")
    if container is None:
        # find() returns None when the container is absent; report "no
        # celebrities" rather than crashing with AttributeError.
        return []

    lst_celebrities = []
    for img in container.find_all("img"):
        link = img.find_parent("a")
        href = link.get("href") if link is not None else None
        name = (img.get("alt") or "").strip()
        if not name or not href:
            # An image without a name or an enclosing link cannot be turned
            # into a usable entry.
            continue
        if not href.startswith(("http://", "https://")):
            href = "http://www.mingxing.com" + href
        src = img.get("src")
        lst_celebrities.append({
            "name": name,
            "url": href,
            "img_urls": [src] if src else [],
        })
    return lst_celebrities
if __name__ == "__main__":
    # Demo: scrape and print the first page of the celebrity index.
    first_page = get_celebrities_one_page(URL_MINGXING_CELEBRITY_LIST, 1)
    print(first_page)
2.2 多页明星列表
# Default number of index pages to crawl.
NUM_PAGES = 10


def get_celebrities(url, num_pages, delay=3):
    """Collect celebrities from the first ``num_pages`` pages of the index.

    Pages are numbered from 1, so pages ``1 .. num_pages`` inclusive are
    fetched.

    Args:
        url: URL of the celebrity list, passed to
            ``get_celebrities_one_page``.
        num_pages: How many pages to fetch; values below 1 fetch nothing.
        delay: Seconds to pause between consecutive page requests.

    Returns:
        The concatenated per-page results, in page order.
    """
    lst_celebrities = []
    # The upper bound is inclusive: range(1, num_pages) would silently drop
    # the last requested page.
    for idx_page in range(1, num_pages + 1):
        if idx_page > 1:
            # Throttle only between requests, not after the final one.
            time.sleep(delay)
        lst_celebrities.extend(get_celebrities_one_page(url, idx_page))
    return lst_celebrities
if __name__ == "__main__":
    # Demo: crawl the configured number of index pages and print the result.
    print(get_celebrities(URL_MINGXING_CELEBRITY_LIST, NUM_PAGES))
[{'name': '鹿晗', 'url': 'http://www.mingxing.com/mingxing/index/name/luhan.html', 'img_urls': ['http://img.mingxing.com/upload/thumb/6/17097.jpg']}, {'name': '迪丽热巴', 'url': 'http://www.mingxing.com/mingxing/index/name/dilireba.html', 'img_urls':
…
['http://img.mingxing.com/upload/thumb/5/14274.jpg']}, {'name': '王艺洁', 'url': 'http://www.mingxing.com/mingxing/index/name/wangyijie.html', 'img_urls': ['http://img.mingxing.com/upload/thumb/5/14276.jpg']}, {'name': '段林希', 'url': 'http://www.mingxing.com/mingxing/index/name/duanlinxi.html', 'img_urls': ['http://img.mingxing.com/upload/thumb/5/14277.jpg']}]
3 明星页面
GET /mingxing/index/name/luhan.html HTTP/1.1
Host: www.mingxing.com
Connection: keep-alive
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6821.400 QQBrowser/10.3.3040.400
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
Referer: http://www.mingxing.com/ziliao/index?&p=1
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9
Cookie: __51cke__=; UM_distinctid=169c7147623b32-041b6f54ccddbe-3257487f-232800-169c7147624ddd; PHPSESSID=9btdnv30htpj54ies9em19pan1; right_adv=%7B%22time%22%3A%222019329%22%2C%22number%22%3A29%7D; __tins__18838395=%7B%22sid%22%3A%201553844269026%2C%20%22vd%22%3A%201%2C%20%22expires%22%3A%201553846069026%7D; __51laig__=30; CNZZDATA30054349=cnzz_eid%3D515205138-1553821270-https%253A%252F%252Fwww.baidu.com%252F%26ntime%3D1553843231
<ul>:class="page_starphoto",明星图片列表
->-
-->
--->
----><img>:src - 明星图片url
def get_celebrity_img_urls(url, timeout=10):
    """Return the photo URLs listed on one celebrity page.

    Args:
        url: URL of the celebrity page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``src`` values of every ``<img>`` inside ``ul.page_starphoto``,
        in document order. Images without a ``src`` are skipped. Returns an
        empty list when the page has no such ``<ul>``.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx response.
    """
    headers = {
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6821.400 QQBrowser/10.3.3040.400",
        "Upgrade-Insecure-Requests": "1",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cookie": "__51cke__=; UM_distinctid=169c7147623b32-041b6f54ccddbe-3257487f-232800-169c7147624ddd; CNZZDATA30054349=cnzz_eid%3D515205138-1553821270-https%253A%252F%252Fwww.baidu.com%252F%26ntime%3D1553821270; PHPSESSID=9btdnv30htpj54ies9em19pan1; right_adv=%7B%22time%22%3A%222019329%22%2C%22number%22%3A20%7D; __tins__18838395=%7B%22sid%22%3A%201553825001869%2C%20%22vd%22%3A%203%2C%20%22expires%22%3A%201553827085186%7D; __51laig__=21",
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    gallery = soup.find("ul", class_="page_starphoto")
    if gallery is None:
        # find() returns None when the gallery is absent; treat that as
        # "no photos" instead of crashing with AttributeError.
        return []
    # img.get() avoids a KeyError on tags that carry no src attribute.
    return [img.get("src") for img in gallery.find_all("img") if img.get("src")]
if __name__ == "__main__":
    # Demo: fetch the photo URLs of one celebrity page; the result is not used.
    luhan_page = "http://www.mingxing.com/mingxing/index/name/luhan.html"
    get_celebrity_img_urls(luhan_page)
4 创建明星面部数据集
if __name__ == "__main__":
    # Build the dataset: one folder per celebrity, images numbered from 0001.
    NUM_PAGES = 10
    DATASET_PATH = "./dataset"
    # Extensions kept as-is when present in the image URL; anything else
    # falls back to ".jpg".
    KNOWN_IMAGE_EXTS = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp")

    # Celebrity list
    lst_celebrities = get_celebrities(URL_MINGXING_CELEBRITY_LIST, NUM_PAGES)
    for celebrity in lst_celebrities:
        # Celebrity folder. The name comes from scraped HTML, so strip path
        # separators and leading/trailing dots before using it as a
        # directory name.
        safe_name = celebrity["name"].replace("/", "_").replace("\\", "_")
        safe_name = safe_name.strip(". ") or "unknown"
        celebrity_dir = os.path.join(DATASET_PATH, safe_name)
        print("*" * 10)
        print("celebrity: {}".format(celebrity["name"]))
        os.makedirs(celebrity_dir, exist_ok=True)

        # Celebrity page. A failed page must not abort the whole crawl; the
        # thumbnail from the index is still downloaded below.
        try:
            celebrity["img_urls"].extend(get_celebrity_img_urls(celebrity["url"]))
        except requests.RequestException as err:
            print("skip page {}: {}".format(celebrity["url"], err))

        img_urls = [img_url for img_url in celebrity["img_urls"] if img_url]
        for idx_img, img_url in enumerate(img_urls, start=1):
            # Keep the real extension so GIF/PNG files are not mislabelled
            # as JPEG; ignore any query string when extracting it.
            ext = os.path.splitext(img_url.split("?", 1)[0])[1].lower()
            if ext not in KNOWN_IMAGE_EXTS:
                ext = ".jpg"
            img_path = os.path.join(celebrity_dir, "{:04d}{}".format(idx_img, ext))
            try:
                get_img(img_url, img_path)
            except (requests.RequestException, OSError) as err:
                print("failed {}: {}".format(img_url, err))
            else:
                print("download {} ---> {}".format(img_url, img_path))
            # Throttle requests to the image server.
            time.sleep(3)
**********
celebrity: 鹿晗
download http://img.mingxing.com/upload/thumb/6/17097.jpg ---> ./dataset\鹿晗\0001.jpg
download http://img.mingxing.com/mingxing//20180928/2e8dc41ba5f72d2e0ed005541a515a54.jpg ---> ./dataset\鹿晗\0002.jpg
download http://img.mingxing.com/mingxing//20180319/c84bf559d0dd0e2fae005f84a4016f6c.jpg ---> ./dataset\鹿晗\0003.jpg
download http://img.mingxing.com/upload/thumb/2017/06-28/0-5QoNse.jpg ---> ./dataset\鹿晗\0004.jpg
download http://img.mingxing.com/upload/thumb/2017/05-02/0-gOgsmr.jpg ---> ./dataset\鹿晗\0005.jpg
download http://img.mingxing.com/upload/thumb/2016/12-20/0-2GSIij.jpg ---> ./dataset\鹿晗\0006.jpg
download http://img.mingxing.com/upload/thumb/2016/07-07/0-yaTLqz.jpg ---> ./dataset\鹿晗\0007.jpg
download http://img.mingxing.com/upload/thumb/2016/04-21/0-iDv7Fj.jpg ---> ./dataset\鹿晗\0008.jpg
download http://img.mingxing.com/upload/thumb/2016/04-11/0-FTCO8H.jpg ---> ./dataset\鹿晗\0009.jpg
download http://img.mingxing.com/upload/thumb/2016/03-21/0-op5Sbt.jpg ---> ./dataset\鹿晗\0010.jpg
download http://img.mingxing.com/upload/thumb/2015/12-30/0-08NWI0.jpg ---> ./dataset\鹿晗\0011.jpg
download http://img.mingxing.com/upload/thumb/2015/12-08/0-wlgnGF.jpg ---> ./dataset\鹿晗\0012.jpg
download http://img.mingxing.com/upload/thumb/2015/11-16/0-uwo1Wk.jpg ---> ./dataset\鹿晗\0013.jpg
**********
celebrity: 迪丽热巴
download http://img.mingxing.com/content/20180103/535f03beaa9b7f0cb3c6f2f302886bf8.jpg ---> ./dataset\迪丽热巴\0001.jpg
download http://img.mingxing.com/mingxing//20181015/14b77dfea0cad1360955d818fcbb0de6.jpg ---> ./dataset\迪丽热巴\0002.jpg
download http://img.mingxing.com/mingxing//20180921/28e35a28498d760e908abce74fd40f5f.jpg ---> ./dataset\迪丽热巴\0003.jpg
download http://img.mingxing.com/mingxing//20180726/17702f5a9b8b998cbb0c70c260b40ad3.gif ---> ./dataset\迪丽热巴\0004.jpg
download http://img.mingxing.com/mingxing//20180620/ea20b15f13f6b34d1b4764553bfba7a9.png ---> ./dataset\迪丽热巴\0005.jpg
download http://img.mingxing.com/mingxing//20180417/985a84ccae9646f31f4dd717ccd40508.jpg ---> ./dataset\迪丽热巴\0006.jpg
download http://img.mingxing.com/mingxing//20180411/5376e604692d6fb42ae7a48e73143eb8.jpg ---> ./dataset\迪丽热巴\0007.jpg
download http://img.mingxing.com/mingxing/20180301/bdd3cbbf262d7793f21ed10975744c22.jpg ---> ./dataset\迪丽热巴\0008.jpg
download http://img.mingxing.com/mingxing/20180301/3418a7189704f4e68f81a29b4320af87.jpg ---> ./dataset\迪丽热巴\0009.jpg
download http://img.mingxing.com/mingxing/20180227/d6aa477ed34271c06fe9edb4dccc9e94.jpg ---> ./dataset\迪丽热巴\0010.jpg
download http://img.mingxing.com/mingxing/20180227/92dccee3c3ab96b8aae57f2f0469b1c2.jpg ---> ./dataset\迪丽热巴\0011.jpg
download http://img.mingxing.com/mingxing/20180226/0fc7ff656cabc975cbb349daeb6ee793.jpg ---> ./dataset\迪丽热巴\0012.jpg
download http://img.mingxing.com/mingxing/20180225/45a68453086b2307eaf10b7921b7e199.jpg ---> ./dataset\迪丽热巴\0013.jpg
...
celebrity: 约翰尼·德普
download http://img.mingxing.com/upload/thumb/5/14261.jpg ---> ./dataset\约翰尼·德普\0001.jpg
download http://img.mingxing.com/upload/thumb/2016/05-24/0-re7Tem.jpg ---> ./dataset\约翰尼·德普\0002.jpg
download http://img.mingxing.com/upload/thumb/2016/04-13/0-X6RYXs.jpg ---> ./dataset\约翰尼·德普\0003.jpg
download http://img.mingxing.com/upload/thumb/2016/03-25/0-bxK5os.jpg ---> ./dataset\约翰尼·德普\0004.jpg
download http://img.mingxing.com/upload/thumb/2016/03-25/0-h77lr9.jpg ---> ./dataset\约翰尼·德普\0005.jpg
download http://img.mingxing.com/upload/thumb/2016/03-17/0-U3Y3EK.jpg ---> ./dataset\约翰尼·德普\0006.jpg
download http://img.mingxing.com/upload/thumb/2016/03-17/0-WUdojP.jpg ---> ./dataset\约翰尼·德普\0007.jpg
download http://img.mingxing.com/upload/thumb/2016/03-17/0-ghntJ4.jpg ---> ./dataset\约翰尼·德普\0008.jpg
download http://img.mingxing.com/upload/thumb/2016/02-26/0-G2Th8a.jpg ---> ./dataset\约翰尼·德普\0009.jpg
download http://img.mingxing.com/upload/thumb/2016/02-23/0-cARUg7.jpg ---> ./dataset\约翰尼·德普\0010.jpg
download http://img.mingxing.com/upload/thumb/2016/02-18/0-DLYZNo.jpg ---> ./dataset\约翰尼·德普\0011.jpg
download http://img.mingxing.com/upload/thumb/2016/01-29/0-Pe5YMh.jpg ---> ./dataset\约翰尼·德普\0012.jpg
**********
celebrity: 雨果·维文
download http://img.mingxing.com/upload/thumb/5/14262.jpg ---> ./dataset\雨果·维文\0001.jpg
download http://img.mingxing.com/upload/thumb/2016/04-13/0-Pm9m6p.jpg ---> ./dataset\雨果·维文\0002.jpg
download http://img.mingxing.com/upload/thumb/2016/04-13/0-kqjoN7.jpg ---> ./dataset\雨果·维文\0003.jpg
download http://img.mingxing.com/upload/thumb/2016/04-08/0-03NXtB.jpg ---> ./dataset\雨果·维文\0004.jpg
download http://img.mingxing.com/upload/thumb/2016/03-30/0-TJRqeD.jpg ---> ./dataset\雨果·维文\0005.jpg
download http://img.mingxing.com/upload/thumb/2016/02-26/0-Wuurq1.jpg ---> ./dataset\雨果·维文\0006.jpg
download http://img.mingxing.com/upload/thumb/2016/02-18/0-4fqOgM.jpg ---> ./dataset\雨果·维文\0007.jpg
**********
celebrity: 希亚·拉博夫