大数据 - 创造101 - 数据整理

数据源

1、小红书
2、百度百科-创造101
3、腾讯官网

101changtui.jpg

采集分析

这次有100多人的数据需要采集，而且分主、副页面，必须使用爬虫处理了。不然手工整理的话，一个人一天20个，5天也能干完。
爬虫打算使用scrapy，文档多，使用方便，支持xpath和css语法，再加上正则表达式，基本上除非有反爬虫设置，否则没什么网页处理不了。
主要目标如下：
1、人名
2、公司
3、排名
4、身高
5、体重
6、英文名
7、图片
8、数据转存json和csv格式处理

项目开始

项目环境
1、python 3.6.3
2、scrapy 1.5.0
3、WIN10

创建项目

pip install scrapy
pip install xpinyin
scrapy startproject P101

编写爬虫

修改spiders下的P101.py，分两段分别编写
1、分析主页面
进入命令行模式，分析需要的数据

scrapy shell http://v.qq.com/biu/101_star_web
>>>sel.xpath('//div[@class="list_item"]//a[contains(@href, "javascript:;")]/text()')
陈意涵
....
....
>>>

2、分析子页面
进入命令行模式，分析需要的数据，即EP1~EP10数据

scrapy shell http://v.qq.com/doki/star?id=1661556
>>>sel.xpath('//div[@id="101"]/@data-round').extract()
[',20,26,31,35,37,35,25,,']
>>>

3、爬虫源码

# -*- coding: utf-8 -*-
from scrapy import Spider, Request
from P101.items import DokiSlimItem
from P101.items import DokiItem
import json
from xpinyin import Pinyin
import urllib


class P101Spider(Spider):
    """Crawl the contestant list page and every contestant's profile page.

    ``parse_p101`` turns each list entry into a ``DokiSlimItem`` (name,
    pinyin name, star id, picture URL) and schedules the matching profile
    page; ``parse_idol`` turns a profile page into a ``DokiItem``.
    """

    name = 'P101'
    allowed_domains = ['v.qq.com']
    start_urls = ['http://v.qq.com/biu/101_star_web']
    p101_url = 'http://v.qq.com/biu/101_star_web'
    # Alternative settings for crawling a local copy of the list page:
    # allowed_domains = ['127.0.0.1:8080']
    # p101_url = 'http://127.0.0.1:8080/rank.html'

    # Profile page URL template; ``starid`` comes from the list page.
    single_url = 'http://v.qq.com/doki/star?id={starid}'

    def start_requests(self):
        """Request the contestant list page and hand it to ``parse_p101``."""
        yield Request(self.p101_url, self.parse_p101)

    def parse_p101(self, response):
        """Yield one ``DokiSlimItem`` and one profile request per list entry.

        Entries that lack a name or a star id are skipped with a warning:
        without them neither the image file name nor the profile URL can be
        built.

        :param response: response of the contestant list page
        :return: generator of ``DokiSlimItem`` objects and profile ``Request``s
        """
        converter = Pinyin()
        for entry in response.xpath('//div[@class="list_item"]'):
            names = entry.xpath(
                './/a[contains(@href, "javascript:;")]/text()').extract()
            starids = entry.xpath(
                './/a[@class="pic"][contains(@href, "javascript:;")]'
                '/@data-starid').extract()
            if not names or not starids:
                self.logger.warning(
                    "Skipping list entry without name or star id on %s",
                    response.url)
                continue

            # When several nodes match, the last one is used.
            cnname = names[-1]
            starid = starids[-1]
            engname = converter.get_pinyin(cnname, '')
            self.logger.debug("list entry: %s (%s)", cnname, starid)

            pics = entry.xpath(
                './/a[@class="pic"][contains(@href, "javascript:;")]'
                '/img/@src').extract()
            for src in pics:
                # A fresh item per picture, so an item that was already
                # yielded is never mutated afterwards.
                item = DokiSlimItem()
                item['name'] = cnname
                item['engname'] = engname
                item['starid'] = starid
                item['pic'] = src
                # The remote URL carries no usable file name, so the image
                # is stored under the pinyin name instead.
                item['images'] = engname + ".png"
                # Picture URLs are protocol-relative ("//host/path"); only
                # those need a scheme prepended.
                image_url = "http:" + src if src.startswith("//") else src
                item['image_urls'] = [image_url]
                yield item

                # Follow up with the contestant's profile page.
                yield Request(self.single_url.format(starid=starid),
                              self.parse_idol)

    def parse_idol(self, response):
        """Build a ``DokiItem`` from one contestant profile page.

        The page is skipped with a warning when the profile table has no
        name. Other missing values are stored as empty strings so that one
        absent field does not lose the whole record.

        :param response: response of a ``single_url`` profile page
        :return: generator yielding at most one ``DokiItem``
        """
        # The star id is whatever follows "id=" in the requested URL.
        starid = str(response.url).strip().split("id=")[-1]
        lines = response.xpath(
            '//div[@class="wiki_info_1"]//div[@class="line"]')

        cnname = self._line_text(lines, 0)
        if cnname is None:
            self.logger.warning("No profile name found on %s", response.url)
            return

        item = DokiItem()
        # Raw comma-separated string from the page's data-round attribute.
        item["epsdata"] = response.xpath(
            '//div[@id="101"]/@data-round').extract_first(default='')
        item['name'] = cnname
        item['engname'] = Pinyin().get_pinyin(cnname, '')
        item['starid'] = starid
        # Profile values are read by their position in the profile table.
        item["height"] = self._line_text(lines, 5, '')
        item["weight"] = self._line_text(lines, 6, '')
        item["hometown"] = self._line_text(lines, 7, '')
        yield item

    @staticmethod
    def _line_text(lines, index, default=None):
        """Return the content text of profile line *index*, or *default*."""
        if index >= len(lines):
            return default
        return lines[index].xpath(
            './/span[@class="content"]/text()').extract_first(default=default)

优化爬虫

1、图片名称是什么？
网站图片是puui.qpic.cn/media_img/0/null1524465427/0，这是什么鬼？
获取下来需要修改成我们能找到的图片。打算使用名字拼音作为图片名字。

2、需要使用ImagesPipeline技术下载图片，当然如果你觉得麻烦，直接request也可以。
没太多难度，网上找个教程，添加进来就能用了。

3、源码

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from .items import DokiSlimItem
from scrapy import Request
from scrapy import log
import requests
import re
import logging
import json


def strip(path):
    """Remove characters that Windows does not allow in file or folder names.

    Removes backslash, slash, colon, asterisk, question mark, double quote,
    angle brackets and pipe. The full-width question mark and the left curly
    quote handled by the earlier version are still removed as well.

    :param path: file or folder name to clean; converted with ``str()`` first
    :return: the name with every listed character removed
    """
    return re.sub(r'[?？\\*|"“<>:/]', '', str(path))


class P101Pipeline:
    """Pass-through item pipeline that leaves every item untouched."""

    def process_item(self, item, spider):
        """Return *item* unchanged.

        :param item: the scraped item
        :param spider: the spider that produced the item (unused)
        :return: the same item object
        """
        return item


class P101ImgDownloadPipeline(ImagesPipeline):
    """Download the pictures of ``DokiSlimItem`` objects.

    Each picture is stored under the file name kept in the item's ``images``
    field. Items of any other type pass through unchanged.
    """

    # Browser-like headers. The requests built below do not send them, so
    # the dict is kept as read-only reference data and is never mutated.
    default_headers = {
        'accept': 'image/webp,image/*,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, sdch, br',
        'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6',
        # 'referer': 'http://puui.qpic.cn/media_img/0/',
        'referer': 'http://127.0.0.1:8080/',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }

    def file_path(self, request, response=None, info=None):
        """Return the storage path for the picture fetched by *request*.

        :param request: image request built by ``get_media_requests``;
            ``request.meta['item']`` holds the target file name
        :param response: unused
        :param info: unused
        :return: the file name after cleaning it with ``strip``
        """
        filename = u'{0}'.format(strip(request.meta['item']))
        logging.debug("file_path: %s", filename)
        return filename

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item['image_urls']``.

        Only ``DokiSlimItem`` objects carry pictures; nothing is yielded for
        any other item type.

        :param item: the scraped item
        :param info: unused
        :return: generator of image ``Request`` objects
        """
        if not isinstance(item, DokiSlimItem):
            return
        for image_url in item['image_urls']:
            logging.debug("get_media_requests url:%s", image_url)
            # The target file name travels with the request so that
            # file_path() can read it back from request.meta.
            yield Request(image_url, meta={'item': item['images']})

    def item_completed(self, results, item, info):
        """Record the stored image paths on the item.

        Items that are not ``DokiSlimItem`` never requested a picture, so
        they are returned untouched instead of being dropped for having no
        images.

        :param results: ``(success, info)`` pairs, one per image request
        :param item: the item the requests were made for
        :param info: unused
        :return: the item, with ``image_paths`` set for ``DokiSlimItem``
        :raises DropItem: when a ``DokiSlimItem`` ended up with no image
        """
        if not isinstance(item, DokiSlimItem):
            return item
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item


class JsonPipeline:
    """Write every item to ``data.json``, one JSON object per line."""

    def process_item(self, item, spider):
        """Serialize *item* as a single JSON line and return it unchanged."""
        self.file.write(json.dumps(dict(item)) + "\n")
        return item

    def open_spider(self, spider):
        """Open ``data.json`` for writing when the spider starts."""
        self.file = open('data.json', 'w')

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.file.close()

数据存储

存成json格式，写到pipeline文件里了

1、源码

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from .items import DokiSlimItem
from scrapy import Request
from scrapy import log
import requests
import re
import logging
import json


class JsonPipeline:
    """Item pipeline that stores each item in ``data.json`` as a JSON line."""

    def open_spider(self, spider):
        """Create or truncate ``data.json`` and keep the handle in ``self.file``."""
        self.file = open('data.json', 'w')

    def process_item(self, item, spider):
        """Write *item* as one JSON object followed by a newline.

        :param item: the scraped item; converted with ``dict()`` first
        :param spider: the spider that produced the item (unused)
        :return: the item, unchanged
        """
        serialized = json.dumps(dict(item))
        self.file.write(serialized + "\n")
        return item

    def close_spider(self, spider):
        """Close the file opened by ``open_spider``."""
        self.file.close()

拿到的数据

{"name": "\u9a6c\u5174\u94b0", "engname": "maxingyu", "starid": "1661557", "pic": "//puui.qpic.cn/media_img/0/null1524465404/0", "images": "maxingyu.png", "image_urls": ["http://puui.qpic.cn/media_img/0/null1524465404/0"]}
{"name": "\u5218\u601d\u7ea4", "engname": "liusixian", "starid": "1642387", "pic": "//puui.qpic.cn/media_img/0/null1524465187/0", "images": "liusixian.png", "image_urls": ["http://puui.qpic.cn/media_img/0/null1524465187/0"]}
{"name": "\u5f20\u695a\u5bd2", "engname": "zhangchuhan", "starid": "1661544", "pic": "//puui.qpic.cn/media_img/0/null1524466277/0", "images": "zhangchuhan.png", "image_urls": ["http://puui.qpic.cn/media_img/0/null1524466277/0"]}
{"name": "\u5411\u4fde\u661f", "engname": "xiangyuxing", "starid": "1572221", "pic": "//puui.qpic.cn/media_img/0/null1524465963/0", "images": "xiangyuxing.png", "image_urls": ["http://puui.qpic.cn/media_img/0/null1524465963/0"]}
{"name": "\u5434\u831c", "engname": "wuqian", "starid": "1661559", "pic": "//puui.qpic.cn/media_img/0/null1524465836/0", "images": "wuqian.png", "image_urls": ["http://puui.qpic.cn/media_img/0/null1524465836/0"]}
{"name": "\u5c39\u854a", "engname": "yinrui", "starid": "1661563", "pic": "//puui.qpic.cn/media_img/0/null1524466237/0", "images": "yinrui.png", "image_urls": ["http://puui.qpic.cn/media_img/0/null1524466237/0"]}
{"epsdata": ",8,8,10,9,9,9,12,,", "name": "\u5085\u83c1", "engname": "fujing", "starid": "1661523", "height": "168", "weight": "46kg", "hometown": "\u4e0a\u6d77"}
{"epsdata": ",,94,90,55,36,23,2,,", "name": "\u738b\u83ca", "engname": "wangju", "starid": "1661570", "height": "165", "weight": "60kg", "hometown": "\u4e0a\u6d77"}
{"epsdata": ",14,16,17,29,25,26,26,,", "name": "\u5434\u6620\u9999", "engname": "wuyingxiang", "starid": "1512788", "height": "164", "weight": "64kg", "hometown": "\u5723\u4fdd\u7f57"}
{"epsdata": ",69,66,47,40,47,48,,,", "name": "\u52fe\u96ea\u83b9", "engname": "gouxueying", "starid": "1597083", "height": "164", "weight": "46kg", "hometown": "\u5317\u4eac"}
{"epsdata": ",42,43,42,48,50,51,,,", "name": "\u5f20\u6eaa", "engname": "zhangxi", "starid": "1661547", "height": "163", "weight": "45kg", "hometown": "\u6dc4\u535a"}
{"epsdata": ",45,50,39,53,54,58,,,", "name": "\u5c39\u854a", "engname": "yinrui", "starid": "1661563", "height": "166", "weight": "42kg", "hometown": "\u91cd\u5e86"}