1. Clone the code and take a look at it
$ git clone https://github.com/linkedin/camus.git
$ cd camus
2. Add a StringMessageDecoder class
package com.linkedin.camus.etl.kafka.coders;

import com.linkedin.camus.coders.CamusWrapper;
import com.linkedin.camus.coders.Message;
import com.linkedin.camus.coders.MessageDecoder;
import org.apache.log4j.Logger;

import java.io.UnsupportedEncodingException;
import java.util.Properties;

/**
 * MessageDecoder that converts the payload into a String object.
 * System.currentTimeMillis() is used to set the CamusWrapper's timestamp
 * property.
 *
 * This MessageDecoder returns a CamusWrapper that works with String payloads.
 */
public class StringMessageDecoder extends MessageDecoder<Message, String> {
  private static final Logger log = Logger.getLogger(StringMessageDecoder.class);

  @Override
  public void init(Properties props, String topicName) {
    this.props = props;
    this.topicName = topicName;
  }

  @Override
  public CamusWrapper<String> decode(Message message) {
    long timestamp;
    String payloadString;

    // Decode the raw Kafka payload as UTF-8 text.
    try {
      payloadString = new String(message.getPayload(), "UTF-8");
    } catch (UnsupportedEncodingException e) {
      log.error("Unable to load UTF-8 encoding, falling back to system default", e);
      payloadString = new String(message.getPayload());
    }

    // The message carries no timestamp of its own, so use the current wall-clock time.
    timestamp = System.currentTimeMillis();
    return new CamusWrapper<String>(payloadString, timestamp);
  }
}
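The new class has to end up inside the shaded example jar, so drop it next to the existing decoders. In the checkouts I have seen, JsonStringMessageDecoder and the other decoders live in the camus-kafka-coders module, so a likely location (verify against your own tree) is:

$ vim camus-kafka-coders/src/main/java/com/linkedin/camus/etl/kafka/coders/StringMessageDecoder.java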
3. Update the Hadoop version in Camus's pom.xml
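The goal is simply to make the Hadoop artifacts Camus compiles against match the Hadoop version running on your cluster. The exact artifact and property names differ between checkouts (older poms pull in hadoop-core, newer ones hadoop-client), so treat the following as a sketch with a placeholder version:

$ grep -n "hadoop" pom.xml    # locate the Hadoop dependency/version entries in the parent pom
$ vim pom.xml                 # set the Hadoop artifact's version to your cluster's version, e.g. 2.6.0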
4. Configure camus.properties
$ cd camus
$ cp camus-example/src/main/resources/camus.properties .
$ vim camus.properties

# Needed Camus properties, more cleanup to come
#
# Almost all properties have decent default properties. When in doubt, comment out the property.
#
# The job name.
camus.job.name=Camus Job

# final top-level data output directory, sub-directory will be dynamically created for each topic pulled
etl.destination.path=/user/kafka/topics
# HDFS location where you want to keep execution files, i.e. offsets, error logs, and count files
etl.execution.base.path=/user/kafka/exec
# where completed Camus job output directories are kept, usually a sub-dir in the base.path
etl.execution.history.path=/user/kafka/camus/exec/history

fs.default.name=hdfs://localhost:9000

# Concrete implementation of the Encoder class to use (used by Kafka Audit, and thus optional for now)
#camus.message.encoder.class=com.linkedin.camus.etl.kafka.coders.DummyKafkaMessageEncoder

# Concrete implementation of the Decoder class to use.
# Out of the box options are:
#  com.linkedin.camus.etl.kafka.coders.JsonStringMessageDecoder - Reads JSON events, and tries to extract timestamp.
#  com.linkedin.camus.etl.kafka.coders.KafkaAvroMessageDecoder - Reads Avro events using a schema from a configured schema repository.
#  com.linkedin.camus.etl.kafka.coders.LatestSchemaKafkaAvroMessageDecoder - Same, but converts event to latest schema for current topic.
## camus.message.decoder.class=com.linkedin.camus.etl.kafka.coders.LatestSchemaKafkaAvroMessageDecoder
camus.message.decoder.class=com.linkedin.camus.etl.kafka.coders.StringMessageDecoder
etl.record.writer.provider.class=com.linkedin.camus.etl.kafka.common.StringRecordWriterProvider

# Decoder class can also be set on a per topic basis.
#camus.message.decoder.class.<topic-name>=com.your.custom.MessageDecoder

# Used by avro-based Decoders (KafkaAvroMessageDecoder and LatestSchemaKafkaAvroMessageDecoder) to use as their schema registry.
# Out of the box options are:
#  com.linkedin.camus.schemaregistry.FileSchemaRegistry
#  com.linkedin.camus.schemaregistry.MemorySchemaRegistry
#  com.linkedin.camus.schemaregistry.AvroRestSchemaRegistry
#  com.linkedin.camus.example.schemaregistry.DummySchemaRegistry
kafka.message.coder.schema.registry.class=com.linkedin.camus.example.DummySchemaRegistry

# Used by JsonStringMessageDecoder when extracting the timestamp
# Choose the field that holds the time stamp (default "timestamp")
#camus.message.timestamp.field=time

# What format is the timestamp in? Out of the box options are:
# "unix" or "unix_seconds": The value will be read as a long containing the seconds since epoc
# "unix_milliseconds": The value will be read as a long containing the milliseconds since epoc
# "ISO-8601": Timestamps will be fed directly into org.joda.time.DateTime constructor, which reads ISO-8601
# All other values will be fed into the java.text.SimpleDateFormat constructor, which will be used to parse the timestamps
# Default is "[dd/MMM/yyyy:HH:mm:ss Z]"
#camus.message.timestamp.format=yyyy-MM-dd_HH:mm:ss
#camus.message.timestamp.format=ISO-8601

# Used by the committer to arrange .avro files into a partitioned scheme. This will be the default partitioner for all
# topic that do not have a partitioner specified.
# Out of the box options are (for all options see the source for configuration options):
#  com.linkedin.camus.etl.kafka.partitioner.HourlyPartitioner, groups files in hourly directories
#  com.linkedin.camus.etl.kafka.partitioner.DailyPartitioner, groups files in daily directories
#  com.linkedin.camus.etl.kafka.partitioner.TimeBasedPartitioner, groups files in very configurable directories
#  com.linkedin.camus.etl.kafka.partitioner.DefaultPartitioner, like HourlyPartitioner but less configurable
#  com.linkedin.camus.etl.kafka.partitioner.TopicGroupingPartitioner
#etl.partitioner.class=com.linkedin.camus.etl.kafka.partitioner.HourlyPartitioner

# Partitioners can also be set on a per-topic basis. (Note though that configuration is currently not per-topic.)
#etl.partitioner.class.<topic-name>=com.your.custom.CustomPartitioner

# all files in this dir will be added to the distributed cache and placed on the classpath for hadoop tasks
# hdfs.default.classpath.dir=

# max hadoop tasks to use, each task can pull multiple topic partitions
mapred.map.tasks=30
# max historical time that will be pulled from each partition based on event timestamp
kafka.max.pull.hrs=1
# events with a timestamp older than this will be discarded.
kafka.max.historical.days=3
# Max minutes for each mapper to pull messages (-1 means no limit)
kafka.max.pull.minutes.per.task=-1

# if whitelist has values, only whitelisted topic are pulled. Nothing on the blacklist is pulled
kafka.blacklist.topics=
kafka.whitelist.topics=app_log_raw
log4j.configuration=true

# Name of the client as seen by kafka
kafka.client.name=camus
# The Kafka brokers to connect to, format: kafka.brokers=host1:port,host2:port,host3:port
kafka.brokers=localhost:9092,localhost:9093,localhost:9094

# Fetch request parameters:
#kafka.fetch.buffer.size=
#kafka.fetch.request.correlationid=
#kafka.fetch.request.max.wait=
#kafka.fetch.request.min.bytes=
#kafka.timeout.value=

#Stops the mapper from getting inundated with Decoder exceptions for the same topic
#Default value is set to 10
max.decoder.exceptions.to.print=5

#Controls the submitting of counts to Kafka
#Default value set to true
post.tracking.counts.to.kafka=false
monitoring.event.class=class.that.generates.record.to.submit.counts.to.kafka

# everything below this point can be ignored for the time being, will provide more documentation down the road
##########################
etl.run.tracking.post=false
kafka.monitor.tier=
etl.counts.path=
kafka.monitor.time.granularity=10

#etl.hourly=hourly
etl.daily=daily

# Should we ignore events that cannot be decoded (exception thrown by MessageDecoder)?
# `false` will fail the job, `true` will silently drop the event.
etl.ignore.schema.errors=false

# configure output compression for deflate or snappy. Defaults to deflate
mapred.output.compress=false
etl.output.codec=gzip
etl.deflate.level=6
#etl.output.codec=snappy

etl.default.timezone=America/Los_Angeles
etl.output.file.time.partition.mins=60
etl.keep.count.files=false
etl.execution.history.max.of.quota=.8

# Configures a customer reporter which extends BaseReporter to send etl data
#etl.reporter.class

mapred.map.max.attempts=1

kafka.client.buffer.size=20971520
kafka.client.so.timeout=60000

#zookeeper.session.timeout=
#zookeeper.connection.timeout=
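Optionally, before the first run, create the HDFS directories referenced above and make sure the submitting user can write to them. Camus creates sub-directories on its own as it runs, so this is only a precaution against permission surprises:

$ hadoop fs -mkdir -p /user/kafka/topics /user/kafka/exec /user/kafka/camus/exec/history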
5. Compile and package
$ mvn clean [compile] package [-DskipTests]

The arguments in brackets are optional; include them or not as needed.
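If the build succeeds, the shaded example jar used in the next step should be sitting under camus-example/target (the version suffix may differ from the one shown below):

$ ls camus-example/target/*-shaded.jar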
6. Run the job
$ cd camus-example
$ cp target/camus-example-0.1.0-SNAPSHOT-shaded.jar .
$ hadoop jar camus-example-0.1.0-SNAPSHOT-shaded.jar com.linkedin.camus.etl.kafka.CamusJob -P ../camus.properties
7. Troubleshooting
Exception in thread "main" java.io.IOException: Mkdirs failed to create /var/folders/hb/s74xd1353n92ht1h8pr8mngh0000gn/T/hadoop-unjar7038503362115517488/META-INF/license
    at org.apache.hadoop.util.RunJar.ensureDirectory(RunJar.java:128)
    at org.apache.hadoop.util.RunJar.unJar(RunJar.java:104)
    at org.apache.hadoop.util.RunJar.unJar(RunJar.java:81)
    at org.apache.hadoop.util.RunJar.run(RunJar.java:209)
    at org.apache.hadoop.util.RunJar.main(RunJar.java:136)

This error seems to be specific to running on a Mac: the shaded jar contains both a META-INF/LICENSE file and a META-INF/license directory, and when hadoop jar unpacks the jar onto macOS's case-insensitive default filesystem the two collide. See:
http://stackoverflow.com/questions/10522835/hadoop-java-io-ioexception-mkdirs-failed-to-create-some-path
Fix the jar by stripping the offending entries:

$ zip -d camus-example-0.1.0-SNAPSHOT-shaded.jar META-INF/LICENSE
$ zip -d camus-example-0.1.0-SNAPSHOT-shaded.jar LICENSE

The commands above delete the named entries from the jar. Confirm they are gone:

$ jar -tvf camus-example-0.1.0-SNAPSHOT-shaded.jar | grep META-INF/LICENSE
8. Check the corresponding files on HDFS
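With the whitelist and destination path configured above, pulled data should land under /user/kafka/topics/app_log_raw. The default partitioner lays files out in date-based sub-directories, so the exact path depends on when the job ran; a recursive listing is the quickest check, and the execution history directory shows what each run committed:

$ hadoop fs -ls -R /user/kafka/topics/app_log_raw | head
$ hadoop fs -ls /user/kafka/camus/exec/history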