0.前言
最近写论文用到了比较出名的DBLP数据集,详细介绍见DBLP官网
DBLP数据集下载地址
DBLP是以XML的形式给出的,类似于
Sanjeev Saxena
Parallel Integer Sorting and Simulation Amongst CRCW Models.
607-619
1996
33
Acta Inf.
7
db/journals/acta/acta33.html#Saxena96
https://doi.org/10.1007/BF03036466
......
此处将实验数据处理的过程整理如下:
1.需求
DBLP现在的数据量太大了(截止到2018-07-12 22:38,2G+),根本用不了那么多,而且在XML中不方便处理。
现在的需求是:
根据需求将一定量的数据导入到数据库(此处以mysql为例)
2.处理
经分析不难想到,问题的解决分为两步:
1.按照需求将文件截取一部分(
有人也许会说,这都是废话,谁都知道,单独列为一步, 是因为一般的编辑器,还真打不开2G+的xml
)
2.xml导入数据库
2.1截取部分文件
截取文件,我推荐的软件是UltraEdit,至于如何破解,此处不做过多赘述
然后截取你需要的数据拷贝到新的文件里即可,需要注意的是:
新文件
的末尾别忘了拷贝结束标签
2.2数据导入数据库
技术选型:
java的DM4J
+mysql
(1)数据库建表
/*
Navicat Premium Data Transfer
Source Server Type : MySQL
Source Server Version : 50527
Source Host : localhost:3306
Source Schema : mypaper
Target Server Type : MySQL
Target Server Version : 50527
File Encoding : 65001
*/
SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;
-- ----------------------------
-- Table structure for paper
-- ----------------------------
DROP TABLE IF EXISTS `paper`;
CREATE TABLE `paper` (
`id` int(32) UNSIGNED ZEROFILL NOT NULL AUTO_INCREMENT,
`author` varchar(1024) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`title` varchar(1024) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`mdate` varchar(512) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`key` varchar(512) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`pages` varchar(512) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`year` varchar(64) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`volume` varchar(512) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`journal` varchar(512) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`number` varchar(64) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`url` varchar(512) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`ee` varchar(512) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 243354 CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Compact;
SET FOREIGN_KEY_CHECKS = 1;
(2)java处理
实体类:
package org.stone6762.domain;
/**
* @ClassName:MyPaper
* @author Stone6762
* @Description:
*/
public class MyPaper {
private String author;
private String title;
private String mdate;
private String key;
private String pages;
private String year;
private String volume;
private String journal;
private String number;
private String url;
private String ee;
public String getAuthor() {
return author;
}
public void setAuthor(String author) {
this.author = author;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getMdate() {
return mdate;
}
public void setMdate(String mdate) {
this.mdate = mdate;
}
public String getKey() {
return key;
}
public void setKey(String key) {
this.key = key;
}
public String getPages() {
return pages;
}
public void setPages(String pages) {
this.pages = pages;
}
public String getYear() {
return year;
}
public void setYear(String year) {
this.year = year;
}
public String getVolume() {
return volume;
}
public void setVolume(String volume) {
this.volume = volume;
}
public String getJournal() {
return journal;
}
public void setJournal(String journal) {
this.journal = journal;
}
public String getNumber() {
return number;
}
public void setNumber(String number) {
this.number = number;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
public String getEe() {
return ee;
}
public void setEe(String ee) {
this.ee = ee;
}
@Override
public String toString() {
return "MyPaper [author=" + author + ", title=" + title + ", mdate=" + mdate + ", key=" + key + ", pages="
+ pages + ", year=" + year + ", volume=" + volume + ", journal=" + journal + ", number=" + number
+ ", url=" + url + ", ee=" + ee + "]";
}
}
xml解析工具
package org.stone6762.utils;
import java.io.File;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.dom4j.Attribute;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import org.stone6762.domain.MyPaper;
/**
* @Title:XML2Bean.java
* @author Stone6762
* @Description:
*/
public class XML2Bean {
public static List getPapersFromXML(File file) {
List paperList = null;
MyPaper paper = null;
SAXReader reader = new SAXReader();
try {
Document document = reader.read(file);
Element paperstore = document.getRootElement();
Iterator storeit = paperstore.elementIterator();
paperList = new ArrayList();
while (storeit.hasNext()) {
paper = new MyPaper();
Element paperElement = (Element) storeit.next();
// 遍历paperElement的属性
List attributes = paperElement.attributes();
for (Attribute attribute : attributes) {
if (attribute.getName().equals("mdate")) {
String mdate = attribute.getValue();
paper.setMdate(mdate);
}
if (attribute.getName().equals("key")) {
String key = attribute.getValue();
paper.setKey(key);
}
}
Iterator paperit = paperElement.elementIterator();
while (paperit.hasNext()) {
Element child = (Element) paperit.next();
String nodeName = child.getName();
if (nodeName.equals("author")) {
String author = child.getStringValue();
paper.setAuthor(author);
} else if (nodeName.equals("title")) {
String title = child.getStringValue();
paper.setTitle(title);
} else if (nodeName.equals("pages")) {
String pages = child.getStringValue();
paper.setPages(pages);
} else if (nodeName.equals("year")) {
String year = child.getStringValue();
paper.setYear(year);
} else if (nodeName.equals("volume")) {
String volume = child.getStringValue();
paper.setVolume(volume);
} else if (nodeName.equals("journal")) {
String journal = child.getStringValue();
paper.setJournal(journal);
} else if (nodeName.equals("number")) {
String number = child.getStringValue();
paper.setNumber(number);
} else if (nodeName.equals("url")) {
String url = child.getStringValue();
paper.setUrl(url);
} else if (nodeName.equals("ee")) {
String ee = child.getStringValue();
paper.setEe(ee);
}
}
paperList.add(paper);
paper = null;
}
} catch (DocumentException e) {
e.printStackTrace();
}
return paperList;
}
}
导入数据库
package org.stone6762.main;
import java.io.File;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.util.List;
import org.stone6762.domain.MyPaper;
import org.stone6762.utils.XML2Bean;
/**
* @ClassName:Main
* @author Stone6762
* @Description:
*/
public class Main {
private static String USERNAME = "root";
private static String PASSWORD = "root";
private static String DRIVER = "com.mysql.jdbc.Driver";
private static String URL = "jdbc:mysql://localhost:3306/mypaper";
private static Connection con = null;
private static PreparedStatement pstmt = null;
public static void main(String[] args) throws Exception {
Class.forName(DRIVER);
System.out.println("注册驱动成功!");
con = DriverManager.getConnection(URL, USERNAME, PASSWORD);
System.out.println("获取连接成功!");
String sql = "INSERT INTO paper (paper.author, "
+ "paper.ee,"
+ "paper.journal,"
+ "paper.key,"
+ "paper.mdate,"
+ "paper.number,"
+ "paper.pages,"
+ "paper.title,"
+ "paper.url,"
+ "paper.volume,"
+ "paper.year )VALUES (?,?,?,?,?,?,?,?,?,?,? )";
pstmt = con.prepareStatement(sql);
List papers = XML2Bean.getPapersFromXML(new File("ref/DBCP_1_1.xml"));
for (int i = 0; i < papers.size(); i++) {
MyPaper myPaper = papers.get(i);
pstmt.setObject(1, myPaper.getAuthor());
pstmt.setObject(2, myPaper.getEe());
pstmt.setObject(3, myPaper.getJournal());
pstmt.setObject(4, myPaper.getKey());
pstmt.setObject(5, myPaper.getMdate());
pstmt.setObject(6, myPaper.getNumber());
pstmt.setObject(7, myPaper.getPages());
pstmt.setObject(8, myPaper.getTitle());
pstmt.setObject(9, myPaper.getUrl());
pstmt.setObject(10, myPaper.getVolume());
pstmt.setObject(11, myPaper.getYear());
pstmt.executeUpdate();
System.out.println(i+"插入成功!");
}
}
}