def get_data(url):
    """Download one GDELT v2 zip archive from *url* and extract its CSV into ./data.

    The archive member name is derived from the URL itself: the part between
    "gdeltv2/" and ".zip" (e.g. "20220101000000.export.CSV").
    Always returns None.
    """
    file_name = url.split("gdeltv2/")[1].split(".zip")[0]
    r = requests.get(url)
    # Persist the payload to a scratch file before unzipping;
    # `with` guarantees the handle is closed even if the write fails.
    with open("./temp.zip", "wb") as temp_file:
        temp_file.write(r.content)
    try:
        with zipfile.ZipFile('./temp.zip', 'r') as my_zip:
            my_zip.extract(file_name, path="./data")
    except (zipfile.BadZipFile, KeyError):
        # BadZipFile: the server returned an error page instead of an archive
        # (GDELT has gaps); KeyError: the expected member is not in the zip.
        # Narrowed from a bare `except Exception`, which mislabeled every
        # failure as a missing file.
        print("%s not exist" % file_name)
    return None
def get_data_df():
    """Drive the bulk download: read dates from date.txt and intraday times
    from time.txt, then fetch the export and mentions archive for every
    date+time combination via get_data().

    Both files are expected to contain one value per line; a date string
    concatenated with a time string forms the GDELT timestamp used in the URL.
    """
    # `with` replaces the original open/readlines/close triples and the
    # manual append loops.
    with open("date.txt") as f:
        date = [line.strip("\n") for line in f]
    with open("time.txt") as f:
        time = [line.strip("\n") for line in f]
    # URL templates for the two GDELT v2 file families.
    url1 = "http://data.gdeltproject.org/gdeltv2/%s.export.CSV.zip"
    url2 = "http://data.gdeltproject.org/gdeltv2/%s.mentions.CSV.zip"
    for i in date:
        for j in time:
            str_real_time = i + j
            get_data(url1 % str_real_time)
            get_data(url2 % str_real_time)
        # Progress marker once per date (after all its times are fetched).
        print("%s-complete" % i)
import os
import shutil
# Sort the downloaded files of each month folder ./totaldata/202201 ..
# ./totaldata/202207 into per-type subfolders (e.g. "export", "mentions")
# inside that same month folder.
for i in range(1, 8):
    src_folder = "./totaldata/20220" + str(i)
    tar_folder = "./totaldata/20220" + str(i)
    files = os.listdir(src_folder)
    # BUG FIX: the original nested a second `for file in files:` loop inside
    # the first, so after the inner loop had moved everything, the outer
    # iterations re-attempted moves of files that were already gone.
    for file in files:
        # 将每个文件的完整路径拼接出来 -> build the file's full path.
        src_path = src_folder + '/' + file
        if os.path.isfile(src_path):
            # Subfolder named after the type token before the extension,
            # e.g. "20220101000000.export.CSV" -> "export".
            tar_path = tar_folder + '/' + file.split('.')[-2]
            print(tar_path)
            # Create the destination folder on first use.
            if not os.path.exists(tar_path):
                os.mkdir(tar_path)
            # Move the file into its type subfolder.
            shutil.move(src_path, tar_path)
# Append every tab-separated file in Folder_Path to one combined CSV,
# printing a running completion percentage.
# (Folder_Path / SaveFile_Path / SaveFile_Name are defined elsewhere.)
os.chdir(Folder_Path)
csv_names = os.listdir()
total = len(csv_names)
# NOTE(review): starts at index 1, skipping the first file — presumably it
# was read separately to seed the output with a header; confirm upstream.
for idx in range(1, total):
    frame = pd.read_csv(csv_names[idx], sep='\t')
    # header=None + mode='a+': append rows without repeating a header line.
    frame.to_csv(SaveFile_Path + "/" + SaveFile_Name, encoding="utf_8_sig",
                 index=False, header=None, mode='a+')
    sys.stdout.write("\r已合并:%.2f%%" % float((idx / total) * 100))
    sys.stdout.flush()
# Count missing values per column, most-missing first
# (df_01 and row — the total row count — are defined elsewhere).
null_counts = df_01.isnull().sum().sort_values(ascending=False)
# 筛选出空值数量大于85% -> columns that are more than 85% empty.
# NOTE(review): this expression's result is discarded — it only displays
# in a notebook cell; it does not affect the drop below.
null_counts[null_counts > row * 0.85]
# Columns identified above as overwhelmingly null; dropped in place.
sparse_columns = ['Actor2Type3Code', 'Actor1Type3Code', 'Actor2Religion2Code',
                  'Actor1Religion2Code', 'Actor2EthnicCode', 'Actor1EthnicCode',
                  'Actor2Religion1Code', 'Actor2KnownGroupCode', 'Actor1Religion1Code',
                  'Actor1KnownGroupCode', 'Actor2Type2Code', 'Actor1Type2Code']
df_01.drop(sparse_columns, axis=1, inplace=True)
得到结果如下:
这里值得注意的是,很多字段在后续分析中没有用到,但还是导入进去了,为了和元数据保持一致性。
-- GDELT v2 "export" (events) table. Column names mirror the upstream GDELT
-- event-record layout so the CSV can be loaded positionally.
-- Note: many columns are unused in later analysis but kept to stay
-- consistent with the source metadata.
CREATE TABLE `export` (
`GLOBALEVENTID` int NOT NULL,
-- Event date fields (stored as numbers, e.g. SQLDATE = 20220101).
`SQLDATE` bigint,
`MonthYear` bigint,
`Year` bigint,
`FractionDate` bigint,
-- Actor 1 attributes.
`Actor1Code` varchar(255),
`Actor1Name` varchar(255),
`Actor1CountryCode` varchar(255),
`Actor1Type1Code` varchar(255),
-- Actor 2 attributes.
`Actor2Code` varchar(255),
`Actor2Name` varchar(255),
`Actor2CountryCode` varchar(255),
`Actor2Type1Code` varchar(255),
-- Event classification and scoring.
`IsRootEvent` varchar(255),
`EventCode` varchar(255),
`EventBaseCode` varchar(255),
`EventRootCode` varchar(255),
`QuadClass` int,
`GoldsteinScale` double,
`NumMentions` int,
`NumSources` int,
`NumArticles` int,
`AvgTone` double,
-- Geographic resolution for actor 1.
`Actor1Geo_Type` varchar(255),
`Actor1Geo_FullName` varchar(255),
`Actor1Geo_CountryCode` varchar(255),
`Actor1Geo_ADM1Code` varchar(255),
`Actor1Geo_ADM2Code` varchar(255),
`Actor1Geo_Lat` double,
`Actor1Geo_Long` double,
`Actor1Geo_FeatureID` varchar(255),
-- Geographic resolution for actor 2.
`Actor2Geo_Type` varchar(255),
`Actor2Geo_FullName` varchar(255),
`Actor2Geo_CountryCode` varchar(255),
`Actor2Geo_ADM1Code` varchar(255),
`Actor2Geo_ADM2Code` varchar(255),
`Actor2Geo_Lat` double,
`Actor2Geo_Long` double,
`Actor2Geo_FeatureID` varchar(255),
-- Geographic resolution for the action itself.
`ActionGeo_Type` varchar(255),
`ActionGeo_FullName` varchar(255),
`ActionGeo_CountryCode` varchar(255),
`ActionGeo_ADM1Code` varchar(255),
`ActionGeo_ADM2Code` varchar(255),
`ActionGeo_Lat` double,
`ActionGeo_Long` double,
`ActionGeo_FeatureID` varchar(255),
-- Ingestion timestamp and the article URL the event was extracted from.
`DATEADDED` bigint,
`SOURCEURL` text,
PRIMARY KEY (`GLOBALEVENTID`)
);
-- Bulk-load the merged monthly CSV into `export`.
-- FIX: the CSV was written by pandas to_csv, which wraps any field containing
-- a comma in double quotes; without OPTIONALLY ENCLOSED BY '"' such rows
-- would be split at the wrong commas. Unquoted fields still parse as before.
LOAD DATA INFILE 'E:/term/code/mergedata/export/export_202201.csv' INTO TABLE
`export`
FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"'
LINES TERMINATED BY '\r\n'
IGNORE 1 ROWS;
-- Drop rows dated before January 2022, then sanity-check the export row count.
-- FIX: terminated both statements with ';' — without them the DELETE and the
-- SELECT run together and fail when executed as a script.
DELETE
FROM rus_and_ukr
WHERE `MonthYear` < 202201;
SELECT COUNT(GLOBALEVENTID) FROM export; -- 21504131
接下来将针对俄乌冲突进行筛选查询和可视化~