2018-05-14
方式过于粗暴,这里是为了记录,切勿模仿
Synset存入
采用简单暴力的方式:通过数据格式调整,将Synset数据拼接进cypher语句,使用拼接后的语句存入数据.
create ( :Synset { ID:"bn:00006329n",MainSense:['WN:EN:Asia'],POS:['NOUN'],iskeyConcept:['false'],Sense:['WN:EN:Asia', 'OMWN_ZH:ZH:亚洲', 'GEONM:EN:Asia', 'GEONM:VI:Châu_Á', 'GEONM:ZH:亚洲', 'WIKI:EN:Asia', 'WIKI:VI:Châu_Á', 'WIKI:ZH:亚洲', 'WIKIDATA:EN:Asia', 'WIKIDATA:VI:châu_Á', 'WIKIDATA:EN:III_JX_es_el_presidente_de_asia', 'WIKIDATA:VI:Á', 'WIKIDATA:VI:Á_châu', 'WIKIDATA:ZH:亚洲', 'WIKIDATA:ZH:亚细亚', 'OMWIKI:EN:Asia', 'OMWIKI:VI:Châu_Á', 'OMWIKI:ZH:亚洲', 'OMWIKI:ZH:亞洲', 'WIKT:EN:Asia', 'WIKT:VI:Châu_Á_(洲亞)', 'WIKT:VI:Á_Châu_(亞洲)', 'WIKIQU:EN:Asia', 'WIKIRED:ZH:Asia', 'WIKIRED:EN:ASIA', 'WIKIRED:EN:Asia_Major', 'WIKIRED:EN:Asian_continent', 'WIKIRED:EN:Continental_Asia', 'WIKIRED:EN:Name_of_Asia', 'WIKIRED:VI:Á_châu', 'WIKIRED:VI:Á_Châu', 'WIKIRED:VI:Á_Tế_Á', 'WIKIRED:EN:Азия', 'WIKIRED:EN:亚', 'WIKIRED:ZH:亚洲大陆', 'WIKIRED:ZH:亚细亚', 'WIKIRED:ZH:亚细亚洲', 'WIKIRED:EN:亜', 'WIKIRED:EN:亞', 'WIKIRED:ZH:亞洲', 'WIKIRED:ZH:亞洲區', 'WIKIRED:ZH:亞西亞', 'WIKIRED:EN:Asia_(continent)', 'WNTR:VI:á', 'WNTR:ZH:亚_洲'],glosses:["The largest continent with 60% of the earth's population; it is joined to Europe on the west to form Eurasia; it is the site of some of the world's earliest civilizations", "Asia is Earth's largest and most populous continent, located primarily in the Eastern and Northern Hemispheres.", 'Especially Malaysia and formerly China when they were held; Asian Festival of Speed', "Continent in Earth''s Eastern Hemisphere", "Continent, mainly on the Earth's Eastern Hemisphere", "The world's largest continent. It occupies the eastern part of the Eurasian landmass and its adjacent islands and is separated from Europe by the Ural Mountains. Asia borders on the Arctic Ocean, the Pacific Ocean, the Indian Ocean, and the Mediterranean and Red Seas in the west. 
It includes the largest peninsulas of Asia Minor, India, Arabia, and Indochina and the island groups of Japan, Indonesia, the Philippines, and Ceylon.", 'A female given name of modern usage, from the continent.', 'The continent of Asia.', "Asia is the world's largest and most populous continent, located primarily in the eastern and northern hemispheres."],Category:['BNCAT:EN:Asia', 'BNCAT:VI:Châu_Á', 'BNCAT:EN:Continents', 'BNCAT:VI:Lục_địa', 'BNCAT:ZH:世界之最', 'BNCAT:ZH:亚洲', 'BNCAT:ZH:大洲']})
create ( :Synset { ID:"bn:15074344n",MainSense:['WIKI:EN:Cà_Mau_Province'],POS:['NOUN'],iskeyConcept:['false'],Sense:['GEONM:EN:Ca_Mau_province', 'WIKI:VI:Cà_Mau', 'WIKI:EN:Cà_Mau_Province', 'WIKI:ZH:金甌省', 'WIKIDATA:EN:Ca_Mau', 'WIKIDATA:EN:Cà_Mau', 'WIKIDATA:VI:Cà_Mau', 'WIKIDATA:EN:Cà_Mau_Province', 'WIKIDATA:VI:Tỉnh_Cà_Mau', 'WIKIDATA:ZH:歌毛省', 'WIKIDATA:ZH:金甌省', 'WIKIRED:EN:Ca_Ma_Province', 'WIKIRED:EN:Ca_Ma_Province,_Vietnam', 'WIKIRED:EN:Ca_Mau_Province', 'WIKIRED:EN:Ca_Mau_province', 'WIKIRED:EN:Cau_Mau', 'WIKIRED:EN:Cà_Ma_Province', 'WIKIRED:EN:Cà_Ma_Province,_Vietnam', 'WIKIRED:EN:Cà_Mau_province', 'WIKIRED:VI:Tỉnh_Cà_Mau', 'WIKIRED:ZH:哥毛省', 'WIKIRED:ZH:歌毛省', 'WIKIRED:ZH:金瓯省', 'WIKITR:VI:tỉnh_cà_mau', 'WIKITR:ZH:金_瓯_省', 'BABELNET:ZH:Ca_Mau'],glosses:['Cà Mau is a province of Vietnam, named after its capital city.', 'Province of Vietnam'],Category:['BNCAT:VI:Cà_Mau', 'BNCAT:EN:Ca_Mau_Province', 'BNCAT:EN:Gulf_of_Thailand', 'BNCAT:VI:Huyện_Cà_Mau', 'BNCAT:EN:Territorial_disputes_of_Cambodia', 'BNCAT:EN:Territorial_disputes_of_Vietnam', 'BNCAT:VI:Từ_gốc_Khmer', 'BNCAT:VI:Vịnh_Thái_Lan', 'BNCAT:ZH:金甌省']})
create ( :Synset { ID:"bn:16443124n",MainSense:['WIKI:EN:Tiền_Giang_Province'],POS:['NOUN'],iskeyConcept:['false'],Sense:['GEONM:EN:Tien_Giang_province', 'WIKI:VI:Tiền_Giang', 'WIKI:EN:Tiền_Giang_Province', 'WIKI:ZH:前江省', 'WIKIDATA:EN:Tien_Giang', 'WIKIDATA:EN:Tiền_Giang', 'WIKIDATA:VI:Tiền_Giang', 'WIKIDATA:EN:Tiền_Giang_province', 'WIKIDATA:VI:Tỉnh_Tiền_Giang', 'WIKIDATA:ZH:前江', 'WIKIDATA:ZH:前江省', 'WIKIDATA:ZH:定祥', 'WIKIRED:EN:Dinh_Tuong_Province', 'WIKIRED:EN:Tien_Giang', 'WIKIRED:EN:Tien_Giang_Province', 'WIKIRED:EN:Tien_Giang_province', 'WIKIRED:EN:Tien_Giang_Province,_Vietnam', 'WIKIRED:EN:Tiền_Giang', 'WIKIRED:EN:Tiền_Giang_province', 'WIKIRED:EN:Tiền_Giang_Province,_Vietnam', 'WIKIRED:VI:Tỉnh_Tiền_Giang', 'WIKIRED:EN:前江', 'WIKIRED:ZH:前江', 'WIKIRED:ZH:定祥', 'WIKIRED:VI:Tiền_Giang_(tỉnh)', 'WIKITR:VI:tỉnh_tiền_giang', 'BABELNET:ZH:Tien_Giang'],glosses:['Tiền Giang is a province in the Mekong Delta region of southern Vietnam.', 'Province of Vietnam'],Category:['BNCAT:VI:Đồng_bằng_sông_Cửu_Long', 'BNCAT:EN:Territorial_disputes_of_Cambodia', 'BNCAT:EN:Territorial_disputes_of_Vietnam', 'BNCAT:VI:Tiền_Giang', 'BNCAT:EN:Tiền_Giang_Province', 'BNCAT:ZH:前江省']})
......
太粗暴了~~~虽然只有160条Synset数据,但仍然导致neo4j直接卡死。作为妥协删除了每个Synset中的Image属性数据(包含条目最多且每个条目都很长的一个属性)————>成功存入
约束还是要象征性的加一下的:
// Guarantee that no two Synset nodes carry the same ID property value.
CREATE CONSTRAINT ON (e:Synset) ASSERT e.ID IS UNIQUE
Synset存完当然是要存入Relation
初步获取的数据:
bn:00003341n Hyponym bn:02796463n
bn:00003341n Hyponym bn:03860877n
bn:00003341n Hyponym bn:03728916n
bn:00003341n has_quality bn:14748597n
bn:00003341n motto bn:02598357n
bn:00003341n executive_body bn:00033909n
bn:00003341n executive_body bn:00078848n
bn:00003341n office_held_by_head_of_government bn:00018323n
bn:00003341n seal_description bn:00041596n
bn:00003341n diplomatic_relation bn:00000536n
bn:00003341n diplomatic_relation bn:00001732n
bn:00003341n diplomatic_relation bn:00001803n
bn:00003341n diplomatic_relation bn:00002362n
数据格式为:SynsetID1 Relation SynsetID2
需要特别注意的是,这里的Synset中包含大量之前未存入数据库的Synset
提取所有SynsetID 以MERGE的方式存入数据库
用正则提取id并拼接成cypher的MERGE语句
# Emit one Cypher MERGE statement for every distinct Synset ID in the relation file.
# Each input line is expected to look like: "SynsetID1 Relation SynsetID2".
# `infile` / `outfile` are the already-opened input and output file objects.
synset_id_pattern = re.compile(r'\b(\w+:\d+\w)\b')
seen_ids = set()
for line in infile:  # read the relation file line by line
    # Word boundaries (instead of a mandatory leading \s+) so that the ID at the
    # very start of the line is captured as well as the one after the relation name.
    ids = synset_id_pattern.findall(line)
    print(ids)
    # Blank or malformed lines yield an empty list and are simply skipped.
    for synset_id in ids:
        if synset_id in seen_ids:
            # MERGE is idempotent, so repeating an ID only bloats the output file.
            continue
        seen_ids.add(synset_id)
        outfile.write("MERGE ( :Synset { ID:\"" + synset_id + "\"})\n")
生成了包含一万多条MERGE语句的文档,一次性复制粘贴至Neo4j的Browser,提示内存不足。于是将10000+条语句进行划分,每次执行500条,存入成功。
当然,正确的做法应该是在程序循环中生成语句,一条一条循环插入。因为还没有研究Neo4j的Python/Java接口,所以采用了这种粗暴的方法。
最后,同样粗暴地生成插入关系的Cypher语句,完成数据存入尝试。
create ( :Synset { ID:"bn:16443124n",MainSense:['WIKI:EN:Tiền_Giang_Province'],POS:['NOUN'],iskeyConcept:['false'],Sense:['GEONM:EN:Tien_Giang_province', 'WIKI:VI:Tiền_Giang', 'WIKI:EN:Tiền_Giang_Province', 'WIKI:ZH:前江省', 'WIKIDATA:EN:Tien_Giang', 'WIKIDATA:EN:Tiền_Giang', 'WIKIDATA:VI:Tiền_Giang', 'WIKIDATA:EN:Tiền_Giang_province', 'WIKIDATA:VI:Tỉnh_Tiền_Giang', 'WIKIDATA:ZH:前江', 'WIKIDATA:ZH:前江省', 'WIKIDATA:ZH:定祥', 'WIKIRED:EN:Dinh_Tuong_Province', 'WIKIRED:EN:Tien_Giang', 'WIKIRED:EN:Tien_Giang_Province', 'WIKIRED:EN:Tien_Giang_province', 'WIKIRED:EN:Tien_Giang_Province,_Vietnam', 'WIKIRED:EN:Tiền_Giang', 'WIKIRED:EN:Tiền_Giang_province', 'WIKIRED:EN:Tiền_Giang_Province,Vietnam', 'WIKIRED:VI:Tỉnh_Tiền_Giang', 'WIKIRED:EN:前江', 'WIKIRED:ZH:前江', 'WIKIRED:ZH:定祥', 'WIKIRED:VI:Tiền_Giang(tỉnh)', 'WIKITR:VI:tỉnh_tiền_giang', 'BABELNET:ZH:Tien_Giang'],glosses:['Tiền Giang is a province in the Mekong Delta region of southern Vietnam.', 'Province of Vietnam'],Category:['BNCAT:VI:Đồng_bằng_sông_Cửu_Long', 'BNCAT:EN:Territorial_disputes_of_Cambodia', 'BNCAT:EN:Territorial_disputes_of_Vietnam', 'BNCAT:VI:Tiền_Giang', 'BNCAT:EN:Tiền_Giang_Province', 'BNCAT:ZH:前江省']})
create ( :Synset { ID:"bn:03506549n",MainSense:['WIKI:EN:Emblem_of_Vietnam'],POS:['NOUN'],iskeyConcept:['false'],Sense:['WIKI:EN:Emblem_of_Vietnam', 'WIKI:VI:Quốc_huy_Việt_Nam', 'WIKI:ZH:越南國徽', 'WIKIDATA:EN:Emblem_of_Vietnam', 'WIKIDATA:VI:Quốc_huy_của_Việt_Nam', 'WIKIDATA:VI:Quốc_huy_Việt_Nam', 'WIKIDATA:ZH:越南國徽', 'WIKIRED:EN:Coat_of_arms_of_South_Vietnam', 'WIKIRED:EN:Coat_of_arms_of_Viet_Nam', 'WIKIRED:EN:Coat_of_Arms_of_Viet_Nam', 'WIKIRED:EN:Coat_of_arms_of_vietnam', 'WIKIRED:EN:Coat_of_arms_of_Vietnam', 'WIKIRED:EN:Emblem_of_South_Vietnam', 'WIKIRED:VI:Quốc_huy_của_Việt_Nam', 'WIKIRED:VI:Quốc_huy_việt_nam', 'WIKIRED:ZH:越南国徽'],glosses:['The emblem of Vietnam is circular, has red background and a yellow star in the middle which represent the Communist Party of Vietnam, the revolutionary history and bright future of Vietnam.', 'Coat of arms'],Category:['BNCAT:VI:Biểu_tượng_của_Việt_Nam', 'BNCAT:EN:Coats_of_arms_of_communist_states', 'BNCAT:EN:Coats_of_arms_with_cogwheels', 'BNCAT:EN:Coats_of_arms_with_rice', 'BNCAT:EN:Coats_of_arms_with_stars', 'BNCAT:EN:Coats_of_arms_with_wheat', 'BNCAT:EN:Heraldry_stubs', 'BNCAT:EN:National_emblems', 'BNCAT:EN:National_symbols_of_Vietnam', 'BNCAT:VI:Quốc_huy', 'BNCAT:VI:Tác_phẩm_1976', 'BNCAT:EN:Vietnam_stubs', 'BNCAT:ZH:国徽', 'BNCAT:ZH:越南國家象徵']})
create ( :Synset { ID:"bn:03663227n",MainSense:['WIKI:EN:Government_of_Vietnam'],POS:['NOUN'],iskeyConcept:['false'],Sense:['WIKI:VI:Chính_phủ_Việt_Nam', 'WIKI:EN:Government_of_Vietnam', 'WIKIDATA:VI:Chính_phủViệt_Nam', 'WIKIDATA:VI:Chính_quyền_Việt_Nam', 'WIKIDATA:EN:Government_of_Vietnam', 'WIKIDATA:VI:Hộiđồng_Bộ_trưởng_Việt_Nam', 'WIKIDATA:EN:Vietnamese_Government', 'WIKIRED:EN:Cabinet_of_Vietnam', 'WIKIRED:VI:Chính_phủ_CHXHCN_Việt_Nam', 'WIKIRED:VI:Chính_phủ_nước_Cộng_hòa_xãhội_chủnghĩa_Việt_Nam', 'WIKIRED:VI:Chính_quyền_Việt_Nam', 'WIKIRED:EN:Council_of_Ministers_of_Vietnam', 'WIKIRED:VI:Hộiđồng_Bộtrưởng_Việt_Nam', 'WIKIRED:EN:Vietnamese_Council_of_Ministers', 'WIKIRED:EN:Vietnamese_government', 'WIKIRED:EN:Council_of_Ministers(Vietnam)', 'WIKIRED:VI:Thành_viên_Chính_phủ(Việt_Nam)'],glosses:['The Government of Vietnam is the executive arm of the Vietnamese state, and the members of the Government are elected by the Nation