1、该文本里,有多少个2012年11月发布的tweets。
目标文件:百度网盘
http://pan.baidu.com/s/1kU6X2GB
所求问题为:该文本里,有多少个2012年11月发布的tweets。 (要求:输出为一个整数。提示:请阅读python的time模块)
python源码:
root@kali:~/python/laowangpy/datadig# cat 3numberof201211.py
#!/usr/bin/python
# --*-- coding:utf-8 --*--
import string
import time
text_userdata = open('twitterdata.txt').read()#对目标文本进行全部读取,并赋值
#print text_userdata
l = [n for n in text_userdata.split('\r\n')]#对目标文件使用\r\n进行分割,把分割好的每个元素加入列表n中
print l[0]#打印第一个元素信息
i = 0
n = []
m = []
while l[i] != "":#当列表l中的元素不等于空白,则进入
s = l[i].replace('"','')#对列表l中每个元素中的"符号使用空白替换
n.append(s)#加入列表n中
t = s.split(",")#对列表s使用,分割
m.append(t[6])#对列表t中第6个元素加入列表m中
f = open("test.txt","a")#打开文件test.txt
f.write(m[i])#对列表m每个元素写入文件test.txt中
f.close()
i = i + 1#循环加1
print m[0]
j = 0
yeararry = []
timestamparry = []
while m[j] != m[-1]:#对列表m循环元素时不等于最后一个呀元素是,则进入
timearry = time.strptime(m[j], "%Y-%m-%d %H:%M:%S")#对列表m每个元素转换成时间格式
timestamp = int(time.mktime(timearry))#对时间格式元素转成时间戳
timestamparry.append(timestamp)#对没有转成时间戳元素加入列表timestamparry
year = time.localtime(timestamparry[j])[0]#对每个时间戳的元素取出第一个年份的值
month = time.localtime(timestamparry[j])[1]#对每个时间戳的元素取出第二个月份的值
yeararry.append(str(year)+str(month))#对取出的整型年份值与整型月份值,强制转成字符型,再组合,再加入列表yeararry
j = j + 1
print [(x,yeararry.count(x)) for x in set(yeararry)]#对列表yeararry中每个元素去重统计出现频次统计
root@kali:~/python/laowangpy/datadig#
脚本运行结果:
root@kali:~/python/laowangpy/datadig# python 3numberof201211.py
"264345016313466880","28803555","てんじょう","","RT@h_ototake:はるかぜちゃんが殺人予告とも取れるツイートを受け、親御さんが警察に通報した。それを受けて、「だから小学生にネットなんかやらせるから…」という感想を漏らしている人が多いことに驚く。「退場すべきは、いじめられた側だ」というわけか。これでは、いじめ...","","2012-11-02 12:35:37","web","26597","","152963467","乙武 洋匡","","はるかぜちゃんが殺人予告とも取れるツイートを受け、親御さんが警察に通報した。それを受けて、「だから小学生にネットなんかやらせるから…」という感想を漏らしている人が多いことに驚く。「退場すべきは、いじめられた側だ」というわけか。これでは、いじめがはびこるわけだ。","","26597","","","264341471132528640","","264341471132528640","264345016313466880","","","","","","","","","","","","https://twitter.com/h_ototake/status/264341471132528640","https://twitter.com/mtenjo/status/264345016313466880"
2012-11-02 12:35:37
[('201411', 4), ('201412', 2), ('201312', 1), ('201311', 19), ('201310', 2), ('201210', 4), ('201211', 124)]
root@kali:~/python/laowangpy/datadig#
test.txt文件保存结果:
root@kali:~/python/laowangpy/datadig# cat test.txt
2012-11-02 12:35:372012-11-02 12:35:552012-11-02 13:02:582012-11-02 13:05:382012-11-02 13:07:302012-11-03 00:30:132012-11-03 01:34:142012-11-03 03:02:062012-11-04 04:17:552012-11-04 05:19:132012-11-04 06:18:202012-11-04 06:19:052012-11-04 06:22:332012-11-04 06:23:212012-11-02 00:26:102012-11-02 00:26:592012-11-02 00:30:212012-11-02 11:44:442012-11-02 13:59:462012-11-03 14:41:052012-11-03 21:43:112012-11-04 10:43:142012-11-03 22:00:092012-11-03 22:00:562012-11-03 22:02:252012-11-03 22:03:182012-11-03 22:05:572012-11-03 22:06:442012-11-03 22:07:252012-11-03 22:08:422012-11-03 22:09:282012-11-03 22:11:382012-11-04 05:07:292012-11-04 05:07:592012-11-03 03:11:192012-11-03 05:03:212012-11-03 12:57:092012-11-03 13:01:272012-11-03 13:03:162012-11-04 10:43:382012-11-04 10:45:352012-11-04 10:48:452012-11-04 06:25:562012-11-04 06:26:452012-11-04 06:27:042012-11-04 07:13:432012-11-04 07:19:532012-11-04 07:32:502012-11-01 17:36:522012-11-02 16:03:472012-11-02 16:04:392012-11-02 16:10:012012-11-02 16:38:572012-11-02 16:40:592012-11-02 16:44:172012-11-03 19:10:112012-11-03 19:11:042012-11-03 19:11:292012-11-03 19:27:312012-11-03 20:39:012012-11-04 00:39:292012-11-04 02:08:092012-11-04 02:08:492012-11-04 02:09:212012-11-04 02:10:002012-11-04 04:39:262012-11-04 08:39:142012-11-04 09:52:282012-11-04 09:53:092012-11-04 10:01:082012-11-04 05:13:072012-11-04 05:15:492012-11-03 11:21:232012-11-03 11:24:152012-11-03 11:31:142012-11-03 12:22:162012-11-03 13:39:292012-11-03 13:40:042012-11-03 13:57:542012-11-03 14:24:462012-11-03 15:03:112012-11-03 19:09:132012-11-03 20:23:572012-10-29 10:10:062012-10-30 06:41:592012-10-30 06:43:302012-10-30 06:45:332012-10-30 09:19:522012-11-01 02:26:282012-11-01 05:47:412012-11-02 07:25:332012-11-02 07:28:152012-11-02 07:48:132012-11-02 07:49:232012-11-03 02:54:542012-11-03 02:59:332012-11-03 23:54:512012-11-03 23:55:092012-11-04 00:25:462012-11-04 00:26:512012-11-04 00:37:452012-11-04 00:44:172012-11-04 01:00:042012-11-04 02:17:512012-11-04 03:33:482012-11-04 04:06:392012-11-04 05:21:152012-11-04 05:23:192012-11-04 05:25:582012-11-04 05:27:052012-11-04 05:28:392012-11-04 02:02:562012-11-04 02:39:212012-11-04 03:02:572012-11-04 03:39:402012-11-04 04:02:482012-11-04 04:39:032012-11-04 04:39:382012-11-04 05:03:002012-11-03 11:29:132012-11-03 12:15:042012-11-03 13:11:162012-11-03 13:15:272012-11-03 15:08:462012-11-03 15:17:552012-11-03 15:19:222012-11-03 15:34:222012-11-04 00:49:032012-11-04 00:52:562012-11-04 04:58:452012-11-03 18:53:522012-11-03 19:25:232012-11-03 20:25:202012-11-03 20:54:012012-11-03 21:22:102012-11-03 21:50:232012-11-03 23:23:072012-11-04 00:09:352012-11-04 00:56:422012-11-04 01:43:112012-11-03 10:38:432012-11-03 11:01:342012-11-03 12:17:262012-11-03 12:54:052012-11-03 16:46:282012-11-03 16:47:412012-11-03 16:48:252012-11-03 22:27:122012-11-03 22:28:142012-11-03 22:30:352012-11-03 22:32:012012-11-03 22:41:062012-11-03 22:42:182012-11-03 22:51:322012-11-04 08:36:512012-11-04 08:45:282012-11-04 09:00:29root@kali:~/python/laowangpy/datadig#
python时间模块操作
时间戳生成与取特定的年份与月份值
将字符串的时间转换为时间戳
方法:
a = "2013-10-10 23:40:00"
将其转换为时间数组
import time
timeArray = time.strptime(a, "%Y-%m-%d %H:%M:%S")
转换为时间戳:
timeStamp = int(time.mktime(timeArray))
timeStamp == 1381419600
root@kali:~/python/laowangpy/datadig# python
Python 2.7.3 (default, Mar 14 2014, 11:57:14)
[GCC 4.7.2] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> import time
>>> print time.time()
1508336495.91
>>> a = '2011-11-12 23:40:45'
>>> timearry =time.strptime(a,"%Y-%m-%d %H:%M:%S")
>>> print timearry
time.struct_time(tm_year=2011, tm_mon=11, tm_mday=12, tm_hour=23, tm_min=40, tm_sec=45, tm_wday=5, tm_yday=316, tm_isdst=-1)
>>> print timearry
time.struct_time(tm_year=2011, tm_mon=11, tm_mday=12, tm_hour=23, tm_min=40, tm_sec=45, tm_wday=5, tm_yday=316, tm_isdst=-1)
>>> timestamp = int(time.mktime(timearry))
>>> print timestamp
1321112445
>>> time.ctime()
'Wed Oct 18 22:41:39 2017'
>>> time.ctime(1321112445)
'Sat Nov 12 23:40:45 2011'
>>> time.gmtime()
time.struct_time(tm_year=2017, tm_mon=10, tm_mday=18, tm_hour=14, tm_min=42, tm_sec=26, tm_wday=2, tm_yday=291, tm_isdst=0)
>>> time.gmtime(1321112445)
time.struct_time(tm_year=2011, tm_mon=11, tm_mday=12, tm_hour=15, tm_min=40, tm_sec=45, tm_wday=5, tm_yday=316, tm_isdst=0)
>>> time.localtime()
time.struct_time(tm_year=2017, tm_mon=10, tm_mday=18, tm_hour=22, tm_min=43, tm_sec=0, tm_wday=2, tm_yday=291, tm_isdst=0)
>>> time.localtime(1321112445)
time.struct_time(tm_year=2011, tm_mon=11, tm_mday=12, tm_hour=23, tm_min=40, tm_sec=45, tm_wday=5, tm_yday=316, tm_isdst=0)
>>>
>>> print time.localtime(1321112445)[0]
2011
>>> print time.localtime(1321112445)[1]
11
>>> print time.localtime(1321112445)[0]
2011
>>>
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2、有哪几天的数据
目标文件:百度网盘
http://pan.baidu.com/s/1kU6X2GB
所求问题为:该文本里,有哪几天的数据? (要求:输出为一个list,例:[‘2012-03-04’,’2012-03-05’])
python源码:
root@kali:~/python/laowangpy/datadig# cat 4adddaysdata.py
#!/usr/bin/python
# --*-- coding:utf-8 --*--
import string
import time
text_userdata = open('twitterdata.txt').read()#对目标文本进行全部读取,并赋值
#print text_userdata
l = [n for n in text_userdata.split('\r\n')]#对目标文件使用\r\n进行分割,把分割好的每个元素加入列表n中
#print l[0]#打印第一个元素信息
i = 0
n = []
m = []
while l[i] != "":#当列表l中的元素不等于空白,则进入
s = l[i].replace('"','')#对列表l中每个元素中的"符号使用空白替换
n.append(s)#加入列表n中
t = s.split(",")#对列表s使用,分割
m.append(t[6])#对列表t中第6个元素加入列表m中
f = open("test.txt","a")#打开文件test.txt
f.write(m[i])#对列表m每个元素写入文件test.txt中
f.close()
i = i + 1#循环加1
#print m[0]#打印列表m的第一个元素
j = 0
yeararry = []
timestamparry = []
while m[j] != m[-1]:#对列表m循环元素时不等于最后一个呀元素是,则进入
timearry = time.strptime(m[j], "%Y-%m-%d %H:%M:%S")#对列表m每个元素转换成时间格式
timestamp = int(time.mktime(timearry))#对时间格式元素转成时间戳
timestamparry.append(timestamp)#对没有转成时间戳元素加入列表timestamparry
year = time.localtime(timestamparry[j])[0]#对每个时间戳的元素取出第一个年份的值
month = time.localtime(timestamparry[j])[1]#对每个时间戳的元素取出第二个月份的值
day = time.localtime(timestamparry[j])[2]#对每个时间戳的元素取出第三个天数的值
yeararry.append(str(year)+"-"+str(month)+"-"+str(day))#对取出的整型年份值、整型月份值和整型天数值,强制转成字符型,再组合,再加入列表yeararry
j = j + 1
fredata = [(x,yeararry.count(x)) for x in set(yeararry)]#对列表yeararry中每个元素出现频次统计
print fredata
print "--------------------------按出现的频率统计如下:--------------------------------"
sortfredata = sorted(fredata)#对列表fredata转换成集合sortfredata
print sorted(sortfredata,key=lambda x:x[1],reverse=True)#对集合sortfredata中使用第二个key值大小,并从大到小排序
root@kali:~/python/laowangpy/datadig#
Python运行情况:((‘2013-10-29’, 1)中的数字1是在该文本日志中出现的频次)
root@kali:~/python/laowangpy/datadig# python 4adddaysdata.py
[('2014-11-3', 3), ('2013-10-29', 1), ('2014-11-4', 1), ('2013-11-3', 10), ('2013-11-2', 1), ('2013-11-4', 8), ('2014-12-1', 1), ('2013-10-4', 1), ('2012-10-30', 4), ('2013-12-4', 1), ('2012-11-4', 48), ('2014-12-3', 1), ('2012-11-1', 2), ('2012-11-2', 19), ('2012-11-3', 55)]
--------------------------按出现的频次的高低排序统计如下:--------------------------------
[('2012-11-3', 55), ('2012-11-4', 48), ('2012-11-2', 19), ('2013-11-3', 10), ('2013-11-4', 8), ('2012-10-30', 4), ('2014-11-3', 3), ('2012-11-1', 2), ('2013-10-29', 1), ('2013-10-4', 1), ('2013-11-2', 1), ('2013-12-4', 1), ('2014-11-4', 1), ('2014-12-1', 1), ('2014-12-3', 1)]
root@kali:~/python/laowangpy/datadig#
——————————————————————————————————————————————
3、哪个小时发布的数据最多
目标文件:百度网盘
http://pan.baidu.com/s/1kU6X2GB
所求问题为:该文本里,在哪个小时发布的数据最多? (要求:输出一个整数。)
python源码:
root@kali:~/python/laowangpy/datadig# cat 5whichhours.py
#!/usr/bin/python
# --*-- coding:utf-8 --*--
import string
import time
text_userdata = open('twitterdata.txt').read()#对目标文本进行全部读取,并赋值
#print text_userdata
l = [n for n in text_userdata.split('\r\n')]#对目标文件使用\r\n进行分割,把分割好的每个元素加入列表n中
#print l[0]#打印第一个元素信息
i = 0
n = []
m = []
while l[i] != "":#当列表l中的元素不等于空白,则进入
s = l[i].replace('"','')#对列表l中每个元素中的"符号使用空白替换
n.append(s)#加入列表n中
t = s.split(",")#对列表s使用,分割
m.append(t[6])#对列表t中第6个元素加入列表m中
f = open("test.txt","a")#打开文件test.txt
f.write(m[i])#对列表m每个元素写入文件test.txt中
f.close()
i = i + 1#循环加1
#print m[0]#打印列表m的第一个元素
j = 0
yeararry = []
timestamparry = []
while m[j] != m[-1]:#对列表m循环元素时不等于最后一个呀元素是,则进入
timearry = time.strptime(m[j], "%Y-%m-%d %H:%M:%S")#对列表m每个元素转换成时间格式
timestamp = int(time.mktime(timearry))#对时间格式元素转成时间戳
timestamparry.append(timestamp)#对没有转成时间戳元素加入列表timestamparry
#year = time.localtime(timestamparry[j])[0]#对每个时间戳的元素取出第一个年份的值
#month = time.localtime(timestamparry[j])[1]#对每个时间戳的元素取出第二个月份的值
#day = time.localtime(timestamparry[j])[2]#对每个时间戳的元素取出第三个天数的值
hour = time.localtime(timestamparry[j])[3]#对每个时间戳的元素取出第四个小时的值
yeararry.append(str(hour))#对取出的整型小时数值,强制转成字符型,再组合,再加入列表yeararry
j = j + 1
fredata = [(x,yeararry.count(x)) for x in set(yeararry)]#对列表yeararry中每个元素出现频次统计
print fredata
print "--------------------------按出现的频次高低排序统计如下:--------------------------------"
sortfredata = sorted(fredata)#对列表fredata转换成集合sortfredata
print sorted(sortfredata,key=lambda x:x[1],reverse=True)#对集合sortfredata中使用第二个key值大小,并从大到小排序
root@kali:~/python/laowangpy/datadig#
python运行情况:((‘22’, 17)中第一数字22为22点的时间值,第二个数字17为在该文本中出现17次)
root@kali:~/python/laowangpy/datadig# python 5whichhours.py
[('20', 4), ('21', 3), ('22', 17), ('23', 3), ('1', 3), ('0', 13), ('3', 5), ('2', 10), ('5', 13), ('4', 7), ('7', 7), ('6', 10), ('9', 3), ('8', 3), ('11', 6), ('10', 7), ('13', 11), ('12', 7), ('15', 5), ('14', 2), ('17', 1), ('16', 9), ('19', 6), ('18', 1)]
--------------------------按出现的频次高低排序统计如下:--------------------------------
[('22', 17), ('0', 13), ('5', 13), ('13', 11), ('2', 10), ('6', 10), ('16', 9), ('10', 7), ('12', 7), ('4', 7), ('7', 7), ('11', 6), ('19', 6), ('15', 5), ('3', 5), ('20', 4), ('1', 3), ('21', 3), ('23', 3), ('8', 3), ('9', 3), ('14', 2), ('17', 1), ('18', 1)]
root@kali:~/python/laowangpy/datadig#