Default imports:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
Keep the following code in a single file where possible.
The data comes from usagov_bitly_data2012-03-16-1331923249.txt:
path = 'usagov_bitly_data2012-03-16-1331923249.txt'
open(path).readline()
"""
'{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11 (KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1, "tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l": "orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r": "http:\\/\\/www.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u": "http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc": 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'
"""
import json
records = [json.loads(line) for line in open(path)]
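A side note (my addition, not from the original walkthrough): the comprehension above leaves the file handle open. An equivalent way to load the records, assuming the same path, that closes the file explicitly:
# Assumed alternative with the same result; closes the file when done
with open(path) as f:
    records = [json.loads(line) for line in f]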
records[0]
"""
{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',
'c': 'US',
'nk': 1,
'tz': 'America/New_York',
'gr': 'MA',
'g': 'A6qOVH',
'h': 'wfLQtf',
'l': 'orofrog',
'al': 'en-US,en;q=0.8',
'hh': '1.usa.gov',
'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991',
't': 1331923247,
'hc': 1331822918,
'cy': 'Danvers',
'll': [42.576698, -70.954903]}
"""
records[0]['tz'] # America/New_York
Suppose we want to know which time zones (the tz field) occur most often in this dataset. There are many ways to get the answer.
First, extract a list of time zones with a list comprehension:
time_zones = [rec['tz'] for rec in records if 'tz' in rec]
time_zones[:10]
"""
['America/New_York',
'America/Denver',
'America/New_York',
'America/Sao_Paulo',
'America/New_York',
'America/New_York',
'Europe/Warsaw',
'',
'',
'']
"""
Not every record has a time zone field, so the if clause at the end of the list comprehension filters out records without one.
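As a quick illustration (my addition), you can count how many records are missing the tz key entirely:
# Illustrative check, not in the original: records with no time zone field at all
missing_tz = sum(1 for rec in records if 'tz' not in rec)
missing_tz  # 120 (these are the rows later labeled 'Missing' after fillna)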
Looking at the first ten time zones above, some of them are unknown (empty strings). They could be filtered out as well, but we will leave them in for now. Next, to count the time zones, here are two approaches:
def get_counts(sequence):
    counts = {}
    for x in sequence:
        if x in counts:
            counts[x] += 1
        else:
            counts[x] = 1
    return counts
A more concise version uses defaultdict from the standard library:
from collections import defaultdict
def get_counts2(sequence):
    counts = defaultdict(int)  # every value is initialized to 0
    for x in sequence:
        counts[x] += 1
    return counts
And a helper that returns the n most common time zones:
def top_counts(count_dict, n=10):
    value_key_pairs = [(count, tz) for tz, count in count_dict.items()]
    value_key_pairs.sort()
    return value_key_pairs[-n:]
counts = get_counts(time_zones)
counts['America/New_York'] # 1251
len(time_zones) # 3440
top_counts(counts)
"""
[(33, 'America/Sao_Paulo'),
(35, 'Europe/Madrid'),
(36, 'Pacific/Honolulu'),
(37, 'Asia/Tokyo'),
(74, 'Europe/London'),
(191, 'America/Denver'),
(382, 'America/Los_Angeles'),
(400, 'America/Chicago'),
(521, ''),
(1251, 'America/New_York')]
"""
Using the collections.Counter class from the Python standard library:
from collections import Counter
counts = Counter(time_zones)  # does the job of get_counts / get_counts2 above
counts.most_common(10)  # does the job of top_counts above
"""
[('America/New_York', 1251),
('', 521),
('America/Chicago', 400),
('America/Los_Angeles', 382),
('America/Denver', 191),
('Europe/London', 74),
('Asia/Tokyo', 37),
('Pacific/Honolulu', 36),
('Europe/Madrid', 35),
('America/Sao_Paulo', 33)]
"""
DataFrame is the most important data structure in pandas; it represents data as a table. Creating a DataFrame from a set of raw records is straightforward:
from pandas import DataFrame, Series
import json
records = [json.loads(line) for line in open(path)]
frame = DataFrame(records)  # printing frame shows a summary view, used for larger DataFrame objects
tz_counts = frame['tz'].value_counts()  # frame['tz'] returns a Series; its value_counts method counts each distinct value and sorts the counts in descending order
tz_counts[:10]
"""
America/New_York 1251
521
America/Chicago 400
America/Los_Angeles 382
America/Denver 191
Europe/London 74
Asia/Tokyo 37
Pacific/Honolulu 36
Europe/Madrid 35
America/Sao_Paulo 33
Name: tz, dtype: int64
"""
Next, we use the plotting library (matplotlib) to make a plot of this data. To do so, first fill in a substitute value for the unknown and missing time zones in the records. The fillna method replaces missing values (NA), while unknown values (empty strings) can be replaced via boolean array indexing:
clean_tz = frame['tz'].fillna('Missing')
clean_tz[clean_tz == ''] = 'Unknown'
tz_counts = clean_tz.value_counts()  # count how many times each distinct value appears in the column
tz_counts[:10]
"""
America/New_York 1251
Unknown 521
America/Chicago 400
America/Los_Angeles 382
America/Denver 191
Missing 120
Europe/London 74
Asia/Tokyo 37
Pacific/Honolulu 36
Europe/Madrid 35
Name: tz, dtype: int64
"""
Calling the plot method on the counts object gives a horizontal bar chart:
tz_counts[:10].plot(kind='barh', rot=0)
plt.show()
There is a lot more we could do with this data. For example, the a field contains information about the browser, device, or application that performed the URL shortening.
frame['a'][1] # 'GoogleMaps/RochesterNY'
Now suppose we want to decompose the time zone statistics into Windows and non-Windows users. We assume a user is on Windows whenever the agent string contains "Windows". Since some agents are missing, first exclude those records from the data:
cframe = frame[frame.a.notnull()]
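For reference (my addition), you can check how many rows this filter drops:
frame['a'].isnull().sum()   # rows with a missing agent string
len(frame) - len(cframe)    # same number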
Then compute whether each row is Windows based on its a value:
operating_system = np.where(cframe['a'].str.contains('Windows'), 'Windows', 'Not Windows')
operating_system[:5]
"""
array(['Windows', 'Not Windows', 'Windows', 'Not Windows', 'Windows'],
dtype='
Next, group the data by the time zone and the newly computed operating system array:
by_tz_os = cframe.groupby(['tz', operating_system])
Then count the groups with size (analogous to the value_counts function above) and reshape the result with unstack:
agg_counts = by_tz_os.size().unstack().fillna(0)
agg_counts[:10]
"""
Not Windows Windows
tz
245.0 276.0
Africa/Cairo 0.0 3.0
Africa/Casablanca 0.0 1.0
Africa/Ceuta 0.0 2.0
Africa/Johannesburg 0.0 1.0
Africa/Lusaka 0.0 1.0
America/Anchorage 4.0 1.0
America/Argentina/Buenos_Aires 1.0 0.0
America/Argentina/Cordoba 0.0 1.0
America/Argentina/Mendoza 0.0 1.0
"""
Finally, let's select the most frequently occurring time zones. To do that, construct an indirect index array from the row totals in agg_counts:
# used to sort in ascending order
indexer = agg_counts.sum(1).argsort()
indexer[:10]
"""
tz
24
Africa/Cairo 20
Africa/Casablanca 21
Africa/Ceuta 92
Africa/Johannesburg 87
Africa/Lusaka 53
America/Anchorage 54
America/Argentina/Buenos_Aires 57
America/Argentina/Cordoba 26
America/Argentina/Mendoza 55
dtype: int64
"""
Then use take to select the rows in that order, and slice off the last 10 rows:
count_subset = agg_counts.take(indexer)[-10:]
count_subset
"""
Not Windows  Windows
tz
America/Sao_Paulo 13.0 20.0
Europe/Madrid 16.0 19.0
Pacific/Honolulu 0.0 36.0
Asia/Tokyo 2.0 35.0
Europe/London 43.0 31.0
America/Denver 132.0 59.0
America/Los_Angeles 130.0 252.0
America/Chicago 115.0 285.0
245.0 276.0
America/New_York 339.0 912.0
"""
Generate a stacked bar chart:
count_subset.plot(kind='barh', stacked=True)
plt.show()
This plot does not make it easy to see the relative proportion of Windows users in the smaller groups, so we can normalize each row to sum to 1 and plot again:
normed_subset = count_subset.div(count_subset.sum(1), axis=0)
normed_subset.plot(kind='barh', stacked=True)
plt.show()
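As a quick check (my addition), each row of normed_subset now sums to 1:
normed_subset.sum(1)  # every value should be 1.0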