(); // registry
/** Get a comparator for a {@link WritableComparable} implementation. */
public static WritableComparator get(Class extends WritableComparable> c) {
WritableComparator comparator = comparators.get(c);
if (comparator == null) {
// force the static initializers to run
forceInit(c);
// look to see if it is defined now
comparator = comparators.get(c);
// if not, use the generic one
if (comparator == null) {
comparator = new WritableComparator(c, true);
}
}
return comparator;
}
/**
* Force initialization of the static members.
* As of Java 5, referencing a class doesn't force it to initialize. Since
* this class requires that the classes be initialized to declare their
* comparators, we force that initialization to happen.
* @param cls the class to initialize
*/
private static void forceInit(Class> cls) {
try {
Class.forName(cls.getName(), true, cls.getClassLoader());
} catch (ClassNotFoundException e) {
throw new IllegalArgumentException("Can't initialize class " + cls, e);
}
}
/** Register an optimized comparator for a {@link WritableComparable}
* implementation. Comparators registered with this method must be
* thread-safe. */
public static void define(Class c, WritableComparator comparator) {
comparators.put(c, comparator);
}
private final Class extends WritableComparable> keyClass;
private final WritableComparable key1;
private final WritableComparable key2;
private final DataInputBuffer buffer;
protected WritableComparator() {
this(null);
}
/** Construct for a {@link WritableComparable} implementation. */
protected WritableComparator(Class extends WritableComparable> keyClass) {
this(keyClass, false);
}
protected WritableComparator(Class extends WritableComparable> keyClass,
boolean createInstances) {
this.keyClass = keyClass;
if (createInstances) {
key1 = newKey();
key2 = newKey();
buffer = new DataInputBuffer();
} else {
key1 = key2 = null;
buffer = null;
}
}
/** Returns the WritableComparable implementation class. */
public Class extends WritableComparable> getKeyClass() { return keyClass; }
/** Construct a new {@link WritableComparable} instance. */
public WritableComparable newKey() {
return ReflectionUtils.newInstance(keyClass, null);
}
/** Optimization hook. Override this to make SequenceFile.Sorter's scream.
*
* The default implementation reads the data into two {@link
* WritableComparable}s (using {@link
* Writable#readFields(DataInput)}, then calls {@link
* #compare(WritableComparable,WritableComparable)}.
*/
@Override
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
try {
buffer.reset(b1, s1, l1); // parse key1
key1.readFields(buffer);
buffer.reset(b2, s2, l2); // parse key2
key2.readFields(buffer);
} catch (IOException e) {
throw new RuntimeException(e);
}
return compare(key1, key2); // compare them
}
/** Compare two WritableComparables.
*
*
The default implementation uses the natural ordering, calling {@link
* Comparable#compareTo(Object)}. */
@SuppressWarnings("unchecked")
public int compare(WritableComparable a, WritableComparable b) {
return a.compareTo(b);
}
@Override
public int compare(Object a, Object b) {
return compare((WritableComparable)a, (WritableComparable)b);
}
/** Lexicographic order of binary data. */
public static int compareBytes(byte[] b1, int s1, int l1,
byte[] b2, int s2, int l2) {
return FastByteComparisons.compareTo(b1, s1, l1, b2, s2, l2);
}
/** Compute hash for binary data. */
public static int hashBytes(byte[] bytes, int offset, int length) {
int hash = 1;
for (int i = offset; i < offset + length; i++)
hash = (31 * hash) + (int)bytes[i];
return hash;
}
/** Compute hash for binary data. */
public static int hashBytes(byte[] bytes, int length) {
return hashBytes(bytes, 0, length);
}
/** Parse an unsigned short from a byte array. */
public static int readUnsignedShort(byte[] bytes, int start) {
return (((bytes[start] & 0xff) << 8) +
((bytes[start+1] & 0xff)));
}
/** Parse an integer from a byte array. */
public static int readInt(byte[] bytes, int start) {
return (((bytes[start ] & 0xff) << 24) +
((bytes[start+1] & 0xff) << 16) +
((bytes[start+2] & 0xff) << 8) +
((bytes[start+3] & 0xff)));
}
/** Parse a float from a byte array. */
public static float readFloat(byte[] bytes, int start) {
return Float.intBitsToFloat(readInt(bytes, start));
}
/** Parse a long from a byte array. */
public static long readLong(byte[] bytes, int start) {
return ((long)(readInt(bytes, start)) << 32) +
(readInt(bytes, start+4) & 0xFFFFFFFFL);
}
/** Parse a double from a byte array. */
public static double readDouble(byte[] bytes, int start) {
return Double.longBitsToDouble(readLong(bytes, start));
}
/**
* Reads a zero-compressed encoded long from a byte array and returns it.
* @param bytes byte array with decode long
* @param start starting index
* @throws java.io.IOException
* @return deserialized long
*/
public static long readVLong(byte[] bytes, int start) throws IOException {
int len = bytes[start];
if (len >= -112) {
return len;
}
boolean isNegative = (len < -120);
len = isNegative ? -(len + 120) : -(len + 112);
if (start+1+len>bytes.length)
throw new IOException(
"Not enough number of bytes for a zero-compressed integer");
long i = 0;
for (int idx = 0; idx < len; idx++) {
i = i << 8;
i = i | (bytes[start+1+idx] & 0xFF);
}
return (isNegative ? (i ^ -1L) : i);
}
/**
* Reads a zero-compressed encoded integer from a byte array and returns it.
* @param bytes byte array with the encoded integer
* @param start start index
* @throws java.io.IOException
* @return deserialized integer
*/
public static int readVInt(byte[] bytes, int start) throws IOException {
return (int) readVLong(bytes, start);
}
}
Comparator
org.apache.hadoop.io.Text.Comparator
/** A WritableComparator optimized for Text keys. */
public static class Comparator extends WritableComparator {
public Comparator() {
super(Text.class);
}
@Override
public int compare(byte[] b1, int s1, int l1,
byte[] b2, int s2, int l2) {
int n1 = WritableUtils.decodeVIntSize(b1[s1]);
int n2 = WritableUtils.decodeVIntSize(b2[s2]);
return compareBytes(b1, s1+n1, l1-n1, b2, s2+n2, l2-n2);
}
}
37、Map side Join说明(replicated Join)
步骤:
1)把小数据集放在LocalResource
job.addCacheFile(new URI(“afile.txt”));
job.addCacheArchive(new URI(“helper.zip”));
2)在mapper的setup()方法中,获取小数据集,并加载到内存中(使用hashmap、treemap等数据结构)
3)当大数据集记录被map()方法读取时,使用小数据集进行匹配过滤
4)如果匹配成功,把两个数据集的记录拼接,并作为output输出
LocalResource表示运行container需要的文件、库,典型的LocalResource有:运行container所需要的jar包、container运行前所需要的配置(远程服务URLs、程序配置等)、静态字典文件等。NodeManager负责在运行container之前,Localize相关资源。
38、Hash Map说明
了解HashMap之前,我们需要知道Object类的两个方法hashCode和equals,我们先来看一下这两个方法的默认实现:
[java] view plaincopyprint?在CODE上查看代码片派生到我的代码片
/** JNI,调用底层其它语言实现 */
public native int hashCode();
/** 默认同==,直接比较对象 */
public boolean equals(Object obj) {
return (this == obj);
}
equals方法我们太熟悉了,我们经常用于字符串比较,String类中重写了equals方法,比较的是字符串值,看一下源码实现:
[java] view plaincopyprint?在CODE上查看代码片派生到我的代码片
public boolean equals(Object anObject) {
if (this == anObject) {
return true;
}
if (anObject instanceof String) {
String anotherString = (String) anObject;
int n = value.length;
if (n == anotherString.value.length) {
char v1[] = value;
char v2[] = anotherString.value;
int i = 0;
// 逐个判断字符是否相等
while (n-- != 0) {
if (v1[i] != v2[i])
return false;
i++;
}
return true;
}
}
return false;
}
重写equals要满足几个条件:
自反性:对于任何非空引用值 x,x.equals(x) 都应返回 true。
对称性:对于任何非空引用值 x 和 y,当且仅当 y.equals(x) 返回 true 时,x.equals(y) 才应返回 true。
传递性:对于任何非空引用值 x、y 和 z,如果 x.equals(y) 返回 true,并且 y.equals(z) 返回 true,那么 x.equals(z) 应返回 true。
一致性:对于任何非空引用值 x 和 y,多次调用 x.equals(y) 始终返回 true 或始终返回 false,前提是对象上 equals 比较中所用的信息没有被修改。
对于任何非空引用值 x,x.equals(null) 都应返回 false。
Object 类的 equals 方法实现对象上差别可能性最大的相等关系;即,对于任何非空引用值 x 和 y,当且仅当 x 和 y 引用同一个对象时,此方法才返回 true(x == y 具有值 true)。 当此方法被重写时,通常有必要重写 hashCode 方法,以维护 hashCode 方法的常规协定,该协定声明相等对象必须具有相等的哈希码。
下面来说说hashCode方法,这个方法我们平时通常是用不到的,它是为哈希家族的集合类框架(HashMap、HashSet、HashTable)提供服务,hashCode 的常规协定是:
在 Java 应用程序执行期间,在同一对象上多次调用 hashCode 方法时,必须一致地返回相同的整数,前提是对象上 equals 比较中所用的信息没有被修改。从某一应用程序的一次执行到同一应用程序的另一次执行,该整数无需保持一致。
如果根据 equals(Object) 方法,两个对象是相等的,那么在两个对象中的每个对象上调用 hashCode 方法都必须生成相同的整数结果。
以下情况不 是必需的:如果根据 equals(java.lang.Object) 方法,两个对象不相等,那么在两个对象中的任一对象上调用 hashCode 方法必定会生成不同的整数结果。但是,程序员应该知道,为不相等的对象生成不同整数结果可以提高哈希表的性能。
当我们看到实现这两个方法有这么多要求时,立刻凌乱了,幸好有IDE来帮助我们,Eclipse中可以通过快捷键alt+shift+s调出快捷菜单,选择Generate hashCode() and equals(),根据业务需求,勾选需要生成的属性,确定之后,这两个方法就生成好了,我们通常需要在JavaBean对象中重写这两个方法。
好了,这两个方法介绍完之后,我们回到HashMap。HashMap是最常用的集合类框架之一,它实现了Map接口,所以存储的元素也是键值对映射的结构,并允许使用null值和null键,其内元素是无序的,如果要保证有序,可以使用LinkedHashMap。HashMap是线程不安全的,下篇文章会讨论。HashMap的类结构如下:
java.util
类 HashMap
java.lang.Object
继承者 java.util.AbstractMap
继承者 java.util.HashMap
所有已实现的接口:
Serializable,Cloneable,Map
直接已知子类:
LinkedHashMap,PrinterStateReasons
HashMap中我们最长用的就是put(K, V)和get(K)。我们都知道,HashMap的K值是唯一的,那如何保证唯一性呢?我们首先想到的是用equals比较,没错,这样可以实现,但随着内部元素的增多,put和get的效率将越来越低,这里的时间复杂度是O(n),假如有1000个元素,put时需要比较1000次。实际上,HashMap很少会用到equals方法,因为其内通过一个哈希表管理所有元素,哈希是通过hash单词音译过来的,也可以称为散列表,哈希算法可以快速的存取元素,当我们调用put存值时,HashMap首先会调用K的hashCode方法,获取哈希码,通过哈希码快速找到某个存放位置,这个位置可以被称之为bucketIndex,通过上面所述hashCode的协定可以知道,如果hashCode不同,equals一定为false,如果hashCode相同,equals不一定为true。所以理论上,hashCode可能存在冲突的情况,有个专业名词叫碰撞,当碰撞发生时,计算出的bucketIndex也是相同的,这时会取到bucketIndex位置已存储的元素,最终通过equals来比较,equals方法就是哈希码碰撞时才会执行的方法,所以前面说HashMap很少会用到equals。HashMap通过hashCode和equals最终判断出K是否已存在,如果已存在,则使用新V值替换旧V值,并返回旧V值,如果不存在 ,则存放新的键值对到bucketIndex位置。
现在我们知道,执行put方法后,最终HashMap的存储结构会有这三种情况,情形3是最少发生的,哈希码发生碰撞属于小概率事件。到目前为止,我们了解了两件事:
HashMap通过键的hashCode来快速的存取元素。
当不同的对象hashCode发生碰撞时,HashMap通过单链表来解决,将新元素加入链表表头,通过next指向原有的元素。单链表在Java中的实现就是对象的引用(复合)。
来鉴赏一下HashMap中put方法源码:
public V put(K key, V value) {
// 处理key为null,HashMap允许key和value为null
if (key == null)
return putForNullKey(value);
// 得到key的哈希码
int hash = hash(key);
// 通过哈希码计算出bucketIndex
int i = indexFor(hash, table.length);
// 取出bucketIndex位置上的元素,并循环单链表,判断key是否已存在
for (Entry e = table[i]; e != null; e = e.next) {
Object k;
// 哈希码相同并且对象相同时
if (e.hash == hash && ((k = e.key) == key || key.equals(k))) {
// 新值替换旧值,并返回旧值
V oldValue = e.value;
e.value = value;
e.recordAccess(this);
return oldValue;
}
}
// key不存在时,加入新元素
modCount++;
addEntry(hash, key, value, i);
return null;
}
到这里,我们了解了HashMap工作原理的一部分,那还有另一部分,如,加载因子及rehash,HashMap通常的使用规则,多线程并发时HashMap存在的问题等等,这些会留在下一章说明。
本文来自:高爽|Coder,原文地址:http://blog.csdn.net/ghsau/article/details/16843543,转载请注明。
java.util.HashMap
Hash table based implementation of the Map interface. This implementation provides all of the optional map operations, and permits null values and the null key. (The HashMap class is roughly equivalent to Hashtable, except that it is unsynchronized and permits nulls.) This class makes no guarantees as to the order of the map; in particular, it does not guarantee that the order will remain constant over time.
This implementation provides constant-time performance for the basic operations (get and put), assuming the hash function disperses the elements properly among the buckets. Iteration over collection views requires time proportional to the "capacity" of the HashMap instance (the number of buckets) plus its size (the number of key-value mappings). Thus, it's very important not to set the initial capacity too high (or the load factor too low) if iteration performance is important.
An instance of HashMap has two parameters that affect its performance: initial capacity and load factor. The capacity is the number of buckets in the hash table, and the initial capacity is simply the capacity at the time the hash table is created. The load factor is a measure of how full the hash table is allowed to get before its capacity is automatically increased. When the number of entries in the hash table exceeds the product of the load factor and the current capacity, the hash table is rehashed (that is, internal data structures are rebuilt) so that the hash table has approximately twice the number of buckets.
As a general rule, the default load factor (.75) offers a good tradeoff between time and space costs. Higher values decrease the space overhead but increase the lookup cost (reflected in most of the operations of the HashMap class, including get and put). The expected number of entries in the map and its load factor should be taken into account when setting its initial capacity, so as to minimize the number of rehash operations. If the initial capacity is greater than the maximum number of entries divided by the load factor, no rehash operations will ever occur.
If many mappings are to be stored in a HashMap instance, creating it with a sufficiently large capacity will allow the mappings to be stored more efficiently than letting it perform automatic rehashing as needed to grow the table. Note that using many keys with the same hashCode() is a sure way to slow down performance of any hash table. To ameliorate impact, when keys are Comparable, this class may use comparison order among keys to help break ties.
Note that this implementation is not synchronized. If multiple threads access a hash map concurrently, and at least one of the threads modifies the map structurally, it must be synchronized externally. (A structural modification is any operation that adds or deletes one or more mappings; merely changing the value associated with a key that an instance already contains is not a structural modification.) This is typically accomplished by synchronizing on some object that naturally encapsulates the map. If no such object exists, the map should be "wrapped" using the Collections.synchronizedMap method. This is best done at creation time, to prevent accidental unsynchronized access to the map:
Map m = Collections.synchronizedMap(new HashMap(...));
The iterators returned by all of this class's "collection view methods" are fail-fast: if the map is structurally modified at any time after the iterator is created, in any way except through the iterator's own remove method, the iterator will throw a ConcurrentModificationException. Thus, in the face of concurrent modification, the iterator fails quickly and cleanly, rather than risking arbitrary, non-deterministic behavior at an undetermined time in the future.
Note that the fail-fast behavior of an iterator cannot be guaranteed as it is, generally speaking, impossible to make any hard guarantees in the presence of unsynchronized concurrent modification. Fail-fast iterators throw ConcurrentModificationException on a best-effort basis. Therefore, it would be wrong to write a program that depended on this exception for its correctness: the fail-fast behavior of iterators should be used only to detect bugs.
This class is a member of the Java Collections Framework.
Type Parameters:
the type of keys maintained by this map
the type of mapped values
Since:
1.2
Author:
Doug Lea
Josh Bloch
Arthur van Hoff
Neal Gafter
See Also:
Object.hashCode()
Collection
Map
TreeMap
Hashtable
39、URI Class说明
URIs, URLs, and URNs
首先,URI,是uniform resource identifier,统一资源标识符,用来唯一的标识一个资源。而URL是uniform resource locator,统一资源定位器,它是一种具体的URI,即URL可以用来标识一个资源,而且还指明了如何locate这个资源。而URN,uniform resource name,统一资源命名,是通过名字来标识资源,比如mailto:[email protected]。也就是说,URI是以一种抽象的,高层次概念定义统一资源标识,而URL和URN则是具体的资源标识的方式。URL和URN都是一种URI。
在Java的URI中,一个URI实例可以代表绝对的,也可以是相对的,只要它符合URI的语法规则。而URL类则不仅符合语义,还包含了定位该资源的信息,因此它不能是相对的,schema必须被指定。
ok,现在回答文章开头提出的问题,到底是imgUrl好呢,还是imgUri好?显然,如果说imgUri是肯定没问题的,因为即使它实际上是url,那它也是uri的一种。那么用imgUrl有没有问题呢?此时则要看它的可能取值,如果是绝对路径,能够定位的,那么用imgUrl是没问题的,而如果是相对路径,那还是不要用ImgUrl的好。总之,用imgUri是肯定没问题的,而用imgUrl则要视实际情况而定。
第二个,从HttpServletRequest的javadoc中可以看出,getRequestURI返回一个String,“the part of this request’s URL from the protocol name up to the query string in the first line of the HTTP request”,比如“POST /some/path.html?a=b HTTP/1.1”,则返回的值为”/some/path.html”。现在可以明白为什么是getRequestURI而不是getRequestURL了,因为此处返回的是相对的路径。而getRequestURL返回一个StringBuffer,“The returned URL contains a protocol, server name, port number, and server path, but it does not include query string parameters.”,完整的请求资源路径,不包括querystring。
总结一下:URL是一种具体的URI,它不仅唯一标识资源,而且还提供了定位该资源的信息。URI是一种语义上的抽象概念,可以是绝对的,也可以是相对的,而URL则必须提供足够的信息来定位,所以,是绝对的,而通常说的relative URL,则是针对另一个absolute URL,本质上还是绝对的。
注:这里的绝对(absolute)是指包含scheme,而相对(relative)则不包含scheme。
URI抽象结构 [scheme:]scheme-specific-part[#fragment]
[scheme:][//authority][path][?query][#fragment]
authority为[user-info@]host[:port]
参考资料:
http://docs.oracle.com/javase/1.5.0/docs/api/java/net/URI.html
http://en.wikipedia.org/wiki/Uniform_Resource_Identifier
http://docs.oracle.com/javaee/5/api/javax/servlet/http/HttpServletRequest.html
ps:
java.net.URL类不提供对标准RFC2396规定的特殊字符的转义,因此需要调用者自己对URL各组成部分进行encode。而java.net.URI则会提供转义功能。因此The recommended way to manage the encoding and decoding of URLs is to use java.net.URI. 可以使用URI.toURL()和URL.toURI()方法来对两个类型的对象互相转换。对于HTML FORM的url encode/decode可以使用java.net.URLEncoder和java.net.URLDecoder来完成,但是对URL对象不适用。
Constructs a URI by parsing the given string.
This constructor parses the given string exactly as specified by the grammar in RFC 2396, Appendix A, except for the following deviations:
An empty authority component is permitted as long as it is followed by a non-empty path, a query component, or a fragment component. This allows the parsing of URIs such as "file:///foo/bar", which seems to be the intent of RFC 2396 although the grammar does not permit it. If the authority component is empty then the user-information, host, and port components are undefined.
Empty relative paths are permitted; this seems to be the intent of RFC 2396 although the grammar does not permit it. The primary consequence of this deviation is that a standalone fragment such as "#foo" parses as a relative URI with an empty path and the given fragment, and can be usefully resolved against a base URI.
IPv4 addresses in host components are parsed rigorously, as specified by RFC 2732: Each element of a dotted-quad address must contain no more than three decimal digits. Each element is further constrained to have a value no greater than 255.
Hostnames in host components that comprise only a single domain label are permitted to start with an alphanum character. This seems to be the intent of RFC 2396 section 3.2.2 although the grammar does not permit it. The consequence of this deviation is that the authority component of a hierarchical URI such as s://123, will parse as a server-based authority.
IPv6 addresses are permitted for the host component. An IPv6 address must be enclosed in square brackets ('[' and ']') as specified by RFC 2732. The IPv6 address itself must parse according to RFC 2373. IPv6 addresses are further constrained to describe no more than sixteen bytes of address information, a constraint implicit in RFC 2373 but not expressible in the grammar.
Characters in the other category are permitted wherever RFC 2396 permits escaped octets, that is, in the user-information, path, query, and fragment components, as well as in the authority component if the authority is registry-based. This allows URIs to contain Unicode characters beyond those in the US-ASCII character set.
Parameters:
str The string to be parsed into a URI
Throws:
NullPointerException - If str is null
URISyntaxException - If the given string violates RFC 2396, as augmented by the above deviations
39、执行jar包出错,nullPointerException
15/12/15 22:50:46 INFO mapreduce.Job: Task Id : attempt_1450188830584_0013_m_000000_1, Status : FAILED
Error: java.lang.NullPointerException
at mapsidejoin.Mapsidejoin$MapsideMapper.map(Mapsidejoin.java:37)
at mapsidejoin.Mapsidejoin$MapsideMapper.map(Mapsidejoin.java:26)
错误原因:hashmap使用containKey方法,需要调用key class的equals()方法
解决方法:做stock class(implements WritableComparable)漏重构方法equal()和hashCode()
Java code
/** * Returns the entry associated with the specified key in the * HashMap. Returns null if the HashMap contains no mapping * for the key. */
final Entry getEntry(Object key) {
int hash = (key == null) 0 : hash(key.hashCode());
for (Entry e = table[indexFor(hash, table.length)]; e != null; e = e.next) {
Object k;
if (e.hash == hash && ((k = e.key) == key || (key != null && key.equals(k))))
return e;
}
return null; }
40、执行jar包出错,unknowHostexception
root@HWX_Java:~/java/labs/Solutions/Lab7.1/MapSideJoin# yarn jar mapsidejoin.jar AIT
15/12/15 23:07:36 INFO impl.TimelineClientImpl: Timeline service address: http://resourcemanager:8188/ws/v1/timeline/
15/12/15 23:07:36 INFO client.RMProxy: Connecting to ResourceManager at resourcemanager/172.17.0.3:8050
15/12/15 23:07:37 INFO mapreduce.JobSubmitter: Cleaning up the staging area /user/root/.staging/job_1450188830584_0017
java.lang.IllegalArgumentException: java.net.UnknownHostException: sandbox
41、在Map Side Join中,使用hashmap存储小表的时候,需要对key的对象重载hashCode()函数,否则hashmap的containsKey()函数找不到对应内容
42、StringUtils.split,对于分列为空字符串,代码会自动删除(因StringUtils.split中,默认separatorChars为false)。如果需要保留,建议使用String.split,或者使用StringUtils.splitWorker(str,separatorchar,-1,true)。
public static String[] split(String str, String separatorChars) {
return splitWorker(str, separatorChars, -1, false);
}
/**
* Performs the logic for the split
and
* splitPreserveAllTokens
methods that do not return a
* maximum array length.
*
* @param str the String to parse, may be null
* @param separatorChar the separate character
* @param preserveAllTokens if true
, adjacent separators are
* treated as empty token separators; if false
, adjacent
* separators are treated as one separator.
* @return an array of parsed Strings, null
if null String input
*/
private static String[] splitWorker(String str, char separatorChar, boolean preserveAllTokens) {
43、reduce side join,需要在map outputKey和map outputValuel里面都设置flag。
outputKey:用于排序数据集;
outputValue flag:用于剔除数据集中记录不存在的情况。
44、设置Partitioner()出错,提示非法分区
16/06/10 13:07:58 INFO mapreduce.Job: Task Id : attempt_1465527536456_0018_m_000000_0, Status : FAILED
Error: java.io.IOException: Illegal partition for Theoldmanandsea,!termnum (-2)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.collect(MapTask.java:1082)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.write(MapTask.java:715)
at org.apache.hadoop.mapreduce.task.TaskInputOutputContextImpl.write(TaskInputOutputContextImpl.java:89)
at org.apache.hadoop.mapreduce.lib.map.WrappedMapper$Context.write(WrappedMapper.java:112)
at tfidf.Tfidf$Tfmapper.cleanup(Tfidf.java:111)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:149)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:787)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1657)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
错误原因:在设置Partitioner函数中,使用key.getYear(String类型)与numPartitions取模作为返回值。由于取模函数支持负数运算,并且运算符号与被除数一致。当key.getYear.hashCode()返回负数时候, (key.getYear.hashCode()%numPartitions也是负数,导致程序出错。
解决方法:修改为 Math.abs(key.getYear.hashCode()%numPartitions);
public static class Exampartitioner extends Partitioner {
@Override
public int getPartition(Airport key, DoubleWritable value,
int numPartitions) {
// TODO Auto-generated method stub
return key.getYear.hashCode()%numPartitions;
}
}
45、partitioner的判断条件需等于Group的条件,或者是Group条件的子集。
46、TotalOrderPartitioner出错,提示如下,
16/06/09 14:01:20 INFO mapreduce.Job: Task Id : attempt_1465443169491_0015_m_000000_2, Status : FAILED
Error: java.lang.IllegalArgumentException: Can't read partitions file
at org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner.setConf(TotalOrderPartitioner.java:116)
at org.apache.hadoop.util.ReflectionUtils.setConf(ReflectionUtils.java:76)
at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:136)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.(MapTask.java:701)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:770)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1657)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
Caused by: java.io.IOException: Wrong number of partitions in keyset
at org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner.setConf(TotalOrderPartitioner.java:90)
... 10 more
问题原因:job.setNumReduceTask(3);放在了InputSampler.Sampler的后面。
解决方法:把job.setNumReduceTask(3)放在前面。