Hadoop classic WordCount and HDFS create, delete, and append

A classic example for learning Hadoop: I pulled together code from a number of places online and made some modifications of my own. The example covers Hadoop's WordCount end to end plus the HDFS file operations (create, delete, append), and it has been run successfully.

1.RunJob.java

package com.dinfo.hadoop;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

import com.dinfo.oec.had.common.ConfigurationCommon;


public class RunJob {

    public static void main(String[] args) throws IOException, InterruptedException {
        Configuration config = ConfigurationCommon.conf;

        /*config.set("fs.default.name", "hdfs://192.168.2.213:9000");
        config.set("hadoop.job.user", "hadoop");
        config.set("mapreduce.framework.name", "yarn");
        config.set("mapreduce.jobtracker.address", "192.168.2.213:9001");
        config.set("yarn.resourcemanager.hostname", "192.168.2.213");
        config.set("yarn.resourcemanager.admin.address", "192.168.2.213:8033");
        config.set("yarn.resourcemanager.address", "192.168.2.213:8032");
        config.set("yarn.resourcemanager.resource-tracker.address", "192.168.2.213:8036");
        config.set("yarn.resourcemanager.scheduler.address", "192.168.2.213:8030");*/

        // Classic (MRv1-style) client settings; the commented block above shows the YARN equivalents.
        config.set("fs.default.name", "hdfs://dinfo213:9000/");
        config.set("hadoop.job.user", "hadoop");
        config.set("mapred.job.tracker", "dinfo213:9001");
        try {
            String inputpath = "/input/wordcount.txt";
            Path outpath = new Path("/output/yjh/word");
            JobConf conf = new JobConf(config, RunJob.class);
            conf.setJobName("yangjianghong");
            conf.setOutputKeyClass(Text.class);
            conf.setOutputValueClass(IntWritable.class);
            conf.setMapperClass(WordCountMapper.class);
            conf.setReducerClass(WordCountReducer.class);
            conf.setInputFormat(TextInputFormat.class);
            conf.setOutputFormat(TextOutputFormat.class);
            //conf.setJar("F:\\Oec-HadRtas-4.0.0-jar-with-dependencies.jar");
            conf.setJarByClass(RunJob.class);
            FileInputFormat.setInputPaths(conf, new Path(inputpath));
            FileOutputFormat.setOutputPath(conf, outpath);
            JobClient.runJob(conf);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
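
One thing to watch when rerunning the job: FileOutputFormat refuses to start if the output directory already exists. A minimal sketch for clearing it first (the OutputCleaner class and clearOutput method are illustrative names, not part of the original code):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Illustrative helper: remove a previous run's output directory so the job can be resubmitted.
public class OutputCleaner {
    public static void clearOutput(Configuration config, String outDir) throws IOException {
        FileSystem fs = FileSystem.get(config);
        Path outpath = new Path(outDir);
        if (fs.exists(outpath)) {
            fs.delete(outpath, true); // recursive delete of the old output
        }
    }
}

Calling something like OutputCleaner.clearOutput(config, "/output/yjh/word") just before JobClient.runJob(conf) avoids the "output directory already exists" failure on a second run.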

2.WordCountMapper.java

package com.dinfo.hadoop;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;

public class WordCountMapper implements Mapper<LongWritable, Text, Text, IntWritable> {

    private Text k = new Text();
    private IntWritable v = new IntWritable(1);

    @Override
    public void configure(JobConf job) {
    }

    @Override
    public void close() throws IOException {
    }

    /**
     * By default the input split is fed to the map task one line at a time. Each line is one
     * record (a key/value pair): the key is the byte offset of the line within the file, and
     * the value is the content of the line.
     */
    @Override
    public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
            throws IOException {
        String line = value.toString();
        String[] ws = StringUtils.split(line, ' ');
        for (String word : ws) {
            k.set(word);
            output.collect(k, v);
        }
        try {
            // Artificial 10-second pause, presumably so the running job can be observed.
            Thread.sleep(10000);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}
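
As a quick sanity check of the mapper: for the input line "hello world hello", map() emits (hello, 1), (world, 1), (hello, 1); the framework then sorts and groups these pairs by key before handing them to the reducer.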

3.WordCountReducer.java

package com.dinfo.hadoop;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

public class WordCountReducer implements Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    public void configure(JobConf job) {
    }

    @Override
    public void close() throws IOException {
    }

    @Override
    public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output,
            Reporter reporter) throws IOException {
        // Sum all of the 1s the mappers emitted for this word.
        int sum = 0;
        while (values.hasNext()) {
            sum += values.next().get();
        }
        output.collect(key, new IntWritable(sum));
    }
}
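
Continuing that example, the reducer receives ("hello", [1, 1]) and ("world", [1]) and writes ("hello", 2) and ("world", 1); with the default single reduce task these end up as tab-separated lines in a part-00000 file under /output/yjh/word.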

4.ConfigurationCommon.java

package com.dinfo.oec.had.common;


import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.Properties;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.log4j.Logger;


import com.dinfo.oec.hadoop.util.PropertyUtil;


/**
 * <p>Date: 2016-03-24 13:49:29</p>
 * <p>Module:</p>
 * <p>Description: builds the cluster connection Configuration</p>
 * <p>Remark: </p>
 * @author xilina
 * @version 4.0.1
 * <p>------------------------------------------------------------</p>
 * <p>Change history</p>
 * <p>Serial number: date: modified by: reason for modification:</p>
 * <p>1</p>
 */
public class ConfigurationCommon {
    public static Configuration conf;
    public static Logger logger = Logger.getLogger(ConfigurationCommon.class);
    public static String claster_name = PropertyUtil.getValue("claster_name");
    public static String namenode_address1 = PropertyUtil.getValue("namenode_address1");
    public static String namenode_address2 = PropertyUtil.getValue("namenode_address2");
    public static String zookeeper_hosts = PropertyUtil.getValue("zookeeper_hosts");
    public static String hadoop_home_dir = PropertyUtil.getValue("hadoop_home_dir");
    public static String hadoop_user_name = PropertyUtil.getValue("hadoop_user_name");

    static {
        conf = getConfiguration(claster_name, namenode_address1, namenode_address2, zookeeper_hosts);
        System.setProperty("hadoop.home.dir", hadoop_home_dir);
        System.setProperty("HADOOP_USER_NAME", hadoop_user_name);
    }

    /**
     * Initialize ConfigurationCommon from a properties stream instead of the classpath file.
     * @param in input stream of a properties file containing the same keys
     * @throws UnsupportedEncodingException
     * @throws IOException
     */
    public static void InitConfigurationCommon(InputStream in) throws UnsupportedEncodingException, IOException {
        Properties property = new Properties();
        property.load(new InputStreamReader(in, "utf-8"));
        claster_name = property.getProperty("claster_name");
        namenode_address1 = property.getProperty("namenode_address1");
        namenode_address2 = property.getProperty("namenode_address2");
        zookeeper_hosts = property.getProperty("zookeeper_hosts");
        hadoop_home_dir = property.getProperty("hadoop_home_dir");
        hadoop_user_name = property.getProperty("hadoop_user_name");
        conf = getConfiguration(claster_name, namenode_address1, namenode_address2, zookeeper_hosts);
        System.setProperty("hadoop.home.dir", hadoop_home_dir);
        System.setProperty("HADOOP_USER_NAME", hadoop_user_name);
        logger.info("Hadoop ConfigurationCommon initialized");
        logger.info("claster_name:" + claster_name + " namenode_address1:" + namenode_address1 + " namenode_address2:" + namenode_address2);
        logger.info("zookeeper_hosts:" + zookeeper_hosts + " hadoop_home_dir:" + hadoop_home_dir + " hadoop_user_name:" + hadoop_user_name);
    }

    /**
     * <p>Description: basic Hadoop environment settings for an HA cluster</p>
     * <p>Remark:</p>
     * @param clusterName cluster (nameservice) name
     * @param nn1Address rpc address of the first namenode
     * @param nn2Address rpc address of the second namenode
     * @param zkhosts ZooKeeper quorum used by HBase
     * @return the connection Configuration object
     */
    public static Configuration getConfiguration(String clusterName, String nn1Address, String nn2Address, String zkhosts) {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://" + clusterName);
        conf.set("dfs.nameservices", clusterName);
        // The HA keys are keyed by the nameservice name (the original hard-coded "hacluster" here),
        // so clusterName must match the claster_name value from the properties file.
        conf.set("dfs.ha.namenodes." + clusterName, "nn1,nn2");
        conf.set("dfs.namenode.rpc-address." + clusterName + ".nn1", nn1Address);
        conf.set("dfs.namenode.rpc-address." + clusterName + ".nn2", nn2Address);
        conf.set("dfs.client.failover.proxy.provider." + clusterName,
                "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");
        conf.set("hbase.zookeeper.quorum", zkhosts);
        // Allow appends and tolerate datanode replacement failures on small clusters.
        conf.setBoolean("dfs.support.append", true);
        conf.set("dfs.client.block.write.replace-datanode-on-failure.policy", "NEVER");
        conf.set("dfs.client.block.write.replace-datanode-on-failure.enable", "true");
        conf = HBaseConfiguration.create(conf);
        return conf;
    }

}
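
The static block reads hdaemonServer.properties from the classpath through PropertyUtil; InitConfigurationCommon lets the same settings be supplied from any stream instead. A minimal sketch, assuming an alternative properties file at a hypothetical local path (note that the static initializer still runs first and expects hdaemonServer.properties on the classpath):

import java.io.FileInputStream;
import java.io.InputStream;

import com.dinfo.oec.had.common.ConfigurationCommon;

public class InitDemo {
    public static void main(String[] args) throws Exception {
        // Hypothetical path; the file must contain the same keys as section 7
        // (claster_name, namenode_address1, namenode_address2, zookeeper_hosts, ...).
        InputStream in = new FileInputStream("conf/hdaemonServer.properties");
        try {
            ConfigurationCommon.InitConfigurationCommon(in);
        } finally {
            in.close();
        }
        // The rebuilt Configuration is now available to HdfsUtil and the MapReduce job.
        System.out.println(ConfigurationCommon.conf.get("fs.defaultFS"));
    }
}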

5.HdfsUtil.java

package com.dinfo.oec.hadoop.util;


import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.log4j.Logger;


import com.dinfo.oec.had.common.ConfigurationCommon;


// Utility class for HDFS data files: create, delete, append and read.
public class HdfsUtil {
    public static Configuration conf;
    public static Logger logger = Logger.getLogger(HdfsUtil.class);

    static {
        conf = ConfigurationCommon.conf;
    }

    public static void main(String[] args) throws IOException {
        /*Path dirpath = new Path("/aa");
        FileSystem fs = getFS();*/
        List<String> list = new ArrayList<String>();
        list.add("2222 中国邮政poc演示");
        // boolean b = HdfsUtil.createDataFile(list, "/input/disanalytest/sanalydata.txt", "hadoop");
        // System.err.println(b);
        String str = HdfsUtil.readFile("/input/disanalytest/sanalydata.txt");
        System.out.println(str);
    }

    /**
     * <p>Description: create the directory at the given path if it does not exist yet (FileSystem.exists / mkdirs)</p>
     * <p>Remark:</p>
     * @param path directory path
     * @param ownerUser owning system user name
     */
    public static void makeDirectory(String path, String ownerUser) {
        Path dirpath = new Path(path);
        FileSystem fs = getFS();
        try {
            if (!fs.exists(dirpath)) {
                fs.mkdirs(dirpath);
                fs.setOwner(dirpath, ownerUser, ownerUser);
                logger.info("Create folder ###" + path + "### successfully!");
            } else {
                logger.info("Folder ###" + path + "### existed!");
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                fs.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * <p>Description: check whether a file exists at the given path in the file system</p>
     * <p>Remark:</p>
     * @param filepath file path
     * @return true if the file exists, false otherwise
     */
    public static boolean isHaveFile(String filepath) {
        Path path = new Path(filepath);
        FileSystem fs = getFS();
        try {
            if (fs.exists(path)) {
                return true;
            } else {
                return false;
            }
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        } finally {
            try {
                fs.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * <p>Description: save a batch of analysis data to the distributed file system (FileSystem.create)</p>
     * <p>Remark:</p>
     * @param datas data lines to write
     * @param filepath file path
     * @param owneruser owning system user name
     * @return true if the data was saved successfully, false otherwise
     */
    public static boolean createDataFile(List<String> datas, String filepath, String owneruser) {
        Path path = new Path(filepath);
        FSDataOutputStream os = null;
        FileSystem fs = getFS();
        try {
            os = fs.create(path);
            fs.setOwner(path, owneruser, owneruser);
            for (String data : datas) {
                os.write(data.getBytes());
                os.write("\r\n".getBytes());
                os.flush();
            }
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        } finally {
            try {
                if (null != os) {
                    os.close();
                }
                fs.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * <p>Description: read a file from HDFS by opening a stream with FSDataInputStream in = fileSystem.open(new Path(path))</p>
     * <p>Remark: keeps the part of each line before the first tab, then the last "###" segment, and joins the lines with "###"</p>
     * @param path file path
     * @return
     */
    public static String readFile(String path) {
        BufferedReader br = null;
        StringBuffer buf = new StringBuffer();
        FileSystem fs = getFS();
        FSDataInputStream in = null;
        try {
            in = fs.open(new Path(path));
            br = new BufferedReader(new InputStreamReader(in, "utf-8"));
            String line = "";
            while ((line = br.readLine()) != null) {
                line = line.split("\t")[0];
                line = line.split("###")[line.split("###").length - 1];
                buf.append("###" + line);
            }
            return buf.toString().replaceFirst("###", "");
        } catch (IllegalArgumentException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (in != null)
                    in.close();
                fs.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return "";
    }

    /**
     * <p>Description: obtain a FileSystem handle for the distributed file system</p>
     * <p>Remark:</p>
     * @return
     */
    public static FileSystem getFS() {
        try {
            FileSystem fs = FileSystem.get(conf);
            return fs;
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * <p>Description: delete a file (FileSystem.delete)</p>
     * <p>Remark:</p>
     * @param path
     */
    public static void deleteFile(String path) {
        if (isHaveFile(path)) {
            FileSystem fs = getFS();
            try {
                fs.delete(new Path(path), true);
            } catch (IllegalArgumentException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                try {
                    fs.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Create a file on HDFS and write data to it.
     * @param data the data to write
     * @param filepath path of the file to create
     * @param owneruser owning system user name
     * @return
     */
    public static boolean createDataFile2(String data, String filepath, String owneruser) {
        Path path = new Path(filepath);
        FSDataOutputStream os = null;
        FileSystem fs = getFS();
        try {
            os = fs.create(path);
            fs.setOwner(path, owneruser, owneruser);
            os.write(data.getBytes("utf-8"));
            os.write("\r\n".getBytes());
            os.flush();
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        } finally {
            try {
                if (null != os) {
                    os.close();
                }
                fs.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Append data to a file on HDFS (requires dfs.support.append, which ConfigurationCommon enables).
     * @param hdfs_path HDFS file path
     * @param str the string to append
     * @param owneruser owning system user name
     * @return
     */
    public static boolean appendFile(String hdfs_path, String str, String owneruser) {
        Path path = new Path(hdfs_path);
        FileSystem fs = null;
        OutputStream out = null;
        try {
            fs = getFS();
            fs.setOwner(path, owneruser, owneruser);
            //fs = FileSystem.get(URI.create(hdfs_path), conf);
            ByteArrayInputStream in = new ByteArrayInputStream((str + "\r\n").getBytes());
            out = fs.append(path);
            IOUtils.copyBytes(in, out, 4096, true);
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        } finally {
            closeAll(fs, out);
        }
    }

    /**
     * Close the output stream and the file system.
     * @param fs HDFS file system
     * @param os output stream
     */
    public static void closeAll(FileSystem fs, OutputStream os) {
        try {
            if (os != null) {
                os.flush();
                os.close();
            }
            if (fs != null) {
                fs.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}
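
A short usage sketch tying the helpers together (the paths and file contents here are illustrative; append relies on dfs.support.append, which getConfiguration in ConfigurationCommon already enables):

import java.util.Arrays;

import com.dinfo.oec.hadoop.util.HdfsUtil;

public class HdfsUtilDemo {
    public static void main(String[] args) {
        String path = "/input/demo/data.txt";                 // illustrative path
        HdfsUtil.makeDirectory("/input/demo", "hadoop");      // create the parent directory if it is missing
        HdfsUtil.createDataFile(Arrays.asList("line one", "line two"), path, "hadoop");
        HdfsUtil.appendFile(path, "line three", "hadoop");    // append one more line
        System.out.println(HdfsUtil.readFile(path));          // prints the lines, rejoined with "###" by readFile
        HdfsUtil.deleteFile(path);                            // clean up
    }
}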

6.PropertyUtil.java

package com.dinfo.oec.hadoop.util;


import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Properties;


import org.apache.log4j.Logger;
import org.mortbay.log.Log;




// Turns the Hadoop configuration file into a stream and loads it through Properties.load(in) so the cluster connection settings can be read.
public class PropertyUtil {
    public static Logger logger = Logger.getLogger(PropertyUtil.class);
    private static Properties p = null;
    private static InputStream in;

    static {
        Log.info(System.getProperty("user.dir"));
        Log.info(PropertyUtil.class.getClassLoader()
                .getResource("hdaemonServer.properties").getPath());
        in = PropertyUtil.class.getClassLoader().getResourceAsStream(
                "hdaemonServer.properties");
    }

    public static String getValue(String key) {
        if (p == null) {
            try {
                p = new Properties();
                InputStreamReader fin = new InputStreamReader(in, "UTF-8");
                p.load(fin);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        String value = p.getProperty(key);
        return value;
    }

}
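
PropertyUtil simply looks keys up in hdaemonServer.properties on the classpath; with the configuration shown in section 7, for example, PropertyUtil.getValue("claster_name") returns "hacluster".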

7.hdaemonServer.properties

#1.------------------------ hdaemonServer configuration --------------------------------------
# Port of the distributed service
hdaemonserver.serverPort=9991
# Cluster service flag: 0 = standalone, 1 = clustered
hdaemonserver.clusterFlg=0
# ZooKeeper service node addresses
hdaemonserver.zookeeperHost=192.168.2.112:2191,192.168.2.212:2191,192.168.2.213:2191
# Path of the executable jar used to invoke the Hadoop cluster services
hadoopJarPath=D:/hadoop/jar/hadoop.jar


#2.----------------------- Hadoop configuration ---------------------------------------------
# Hadoop cluster (nameservice) name
claster_name=hacluster
# Hadoop cluster namenode address (nn1)
namenode_address1=192.168.2.213:9000
# Hadoop cluster namenode address (nn2)
namenode_address2=192.168.2.212:9000
# ZooKeeper addresses of the Hadoop cluster
zookeeper_hosts=192.168.2.213:2181,192.168.2.212:2181,192.168.2.112:2181
# Local Hadoop client home directory
hadoop_home_dir=C:/hadoopPlun/hadoop-2.6.0
# Hadoop user name
hadoop_user_name=hadoop
# Output path for job results
outpath=/output/disanalyresult
