Apache Tika是怎么识别待解析文档的mime类型的,是怎么根据mime类型得到相应的解析类Parser的,如果我们添加自定义mime类型以及相应的解析类,又该怎么处理呢?前面的文章还没有具体解决这些关键问题
在tika-core的jar路径org.apache.tika.mime下有一tika-mimetypes.xml文件,里面记录了tika支持的mime类型,文件格式如下
<mime-info> <mime-type type="application/activemessage"/> <mime-type type="application/andrew-inset"> <glob pattern="*.ez"/> </mime-type> <mime-type type="application/applefile"/> <mime-type type="application/applixware"> <glob pattern="*.aw"/> </mime-type> <mime-type type="application/atom+xml"> <root-XML localName="feed" namespaceURI="http://purl.org/atom/ns#"/> <glob pattern="*.atom"/> </mime-type> </mime-info>
因为文件比较长,我这里就只贴出来了一部分
先熟悉一下tika的mime类型加载及mime类型检测的相关UML模型图
Apache Tika是通过SAX方式来解析该XML文件的,事件处理类MimeTypesReader的源码如下:
/**
 * SAX event handler that reads a MIME type definition document and feeds
 * every type, alias, supertype, glob pattern, root-XML rule and magic
 * clause it finds into a {@link MimeTypes} registry.
 * <p>
 * The element and attribute name constants used below are not declared in
 * this class; presumably they come from {@code MimeTypesReaderMetKeys}.
 * <p>
 * The handler keeps parsing state in instance fields, so one instance must
 * not be used for two parses at the same time.
 */
class MimeTypesReader extends DefaultHandler implements MimeTypesReaderMetKeys {

    /** Registry that is populated while the document is read. */
    private final MimeTypes types;

    /** Type whose {@code mime-type} element is currently open, or null. */
    private MimeType type = null;

    /** Priority of the {@code magic} element currently being read. */
    private int priority;

    /** Collects the text of an open comment element; null otherwise. */
    private StringBuilder characters = null;

    /**
     * Creates a reader that registers everything it reads in the given
     * registry.
     *
     * @param types registry to populate
     */
    MimeTypesReader(MimeTypes types) {
        this.types = types;
    }

    /**
     * Parses a type definition document from the given stream using a
     * non-namespace-aware SAX parser. The stream is not closed here.
     *
     * @param stream XML document to read
     * @throws IOException if the stream can not be read
     * @throws MimeTypeException if no parser can be created or the
     *         document is not a valid type configuration
     */
    void read(InputStream stream) throws IOException, MimeTypeException {
        try {
            SAXParserFactory factory = SAXParserFactory.newInstance();
            factory.setNamespaceAware(false);
            SAXParser parser = factory.newSAXParser();
            parser.parse(stream, this);
        } catch (ParserConfigurationException e) {
            throw new MimeTypeException("Unable to create an XML parser", e);
        } catch (SAXException e) {
            throw new MimeTypeException("Invalid type configuration", e);
        }
    }

    /**
     * Reads a type definition from an already parsed DOM document by
     * replaying it as SAX events into this handler.
     *
     * @param document type definition document
     * @throws MimeTypeException if the document can not be processed
     */
    void read(Document document) throws MimeTypeException {
        try {
            TransformerFactory factory = TransformerFactory.newInstance();
            Transformer transformer = factory.newTransformer();
            transformer.transform(new DOMSource(document), new SAXResult(this));
        } catch (TransformerException e) {
            throw new MimeTypeException("Failed to parse type registry", e);
        }
    }

    /**
     * Resolves every external entity to an empty stream, so nothing is
     * ever loaded from the given public or system identifier.
     */
    @Override
    public InputSource resolveEntity(String publicId, String systemId) {
        return new InputSource(new ByteArrayInputStream(new byte[0]));
    }

    /**
     * Handles an opening tag. Outside of a {@code mime-type} element only
     * the {@code mime-type} tag itself is recognised; inside one, the
     * child elements update the current type or the registry.
     *
     * @throws SAXException if the registry rejects a type name or glob
     *         pattern, or if a magic priority is not an integer
     */
    @Override
    public void startElement(
            String uri, String localName, String qName, Attributes attributes)
            throws SAXException {
        if (type == null) {
            if (MIME_TYPE_TAG.equals(qName)) {
                String name = attributes.getValue(MIME_TYPE_TYPE_ATTR);
                try {
                    type = types.forName(name);
                } catch (MimeTypeException e) {
                    throw new SAXException(e);
                }
            }
        } else if (ALIAS_TAG.equals(qName)) {
            String alias = attributes.getValue(ALIAS_TYPE_ATTR);
            types.addAlias(type, MediaType.parse(alias));
        } else if (SUB_CLASS_OF_TAG.equals(qName)) {
            String parent = attributes.getValue(SUB_CLASS_TYPE_ATTR);
            types.setSuperType(type, MediaType.parse(parent));
        } else if (COMMENT_TAG.equals(qName)) {
            // Start buffering text; it is consumed in endElement().
            characters = new StringBuilder();
        } else if (GLOB_TAG.equals(qName)) {
            String pattern = attributes.getValue(PATTERN_ATTR);
            String isRegex = attributes.getValue(ISREGEX_ATTR);
            if (pattern != null) {
                try {
                    // Boolean.valueOf(null) is false, i.e. a plain glob.
                    types.addPattern(type, pattern, Boolean.valueOf(isRegex));
                } catch (MimeTypeException e) {
                    throw new SAXException(e);
                }
            }
        } else if (ROOT_XML_TAG.equals(qName)) {
            String namespace = attributes.getValue(NS_URI_ATTR);
            String name = attributes.getValue(LOCAL_NAME_ATTR);
            type.addRootXML(namespace, name);
        } else if (MATCH_TAG.equals(qName)) {
            String kind = attributes.getValue(MATCH_TYPE_ATTR);
            String offset = attributes.getValue(MATCH_OFFSET_ATTR);
            String value = attributes.getValue(MATCH_VALUE_ATTR);
            String mask = attributes.getValue(MATCH_MASK_ATTR);
            if (kind == null) {
                kind = "string";
            }
            // The new record becomes a child of the currently open record.
            current = new ClauseRecord(
                    new MagicMatch(type.getType(), kind, offset, value, mask));
        } else if (MAGIC_TAG.equals(qName)) {
            String value = attributes.getValue(MAGIC_PRIORITY_ATTR);
            if (value != null && value.length() > 0) {
                try {
                    priority = Integer.parseInt(value);
                } catch (NumberFormatException e) {
                    // Report a malformed priority like any other invalid
                    // configuration instead of leaking an unchecked
                    // exception out of the parser.
                    throw new SAXException(
                            "Invalid magic priority: " + value, e);
                }
            } else {
                priority = 50;
            }
            current = new ClauseRecord(null);
        }
    }

    /**
     * Handles a closing tag. Closing tags are only processed while a
     * {@code mime-type} element is open.
     */
    @Override
    public void endElement(String uri, String localName, String qName) {
        if (type != null) {
            if (MIME_TYPE_TAG.equals(qName)) {
                type = null;
            } else if (COMMENT_TAG.equals(qName)) {
                type.setDescription(characters.toString().trim());
                characters = null;
            } else if (MATCH_TAG.equals(qName)) {
                current.stop();
            } else if (MAGIC_TAG.equals(qName)) {
                // A magic element without any match children collected no
                // clauses (the list is still null); there is then nothing
                // to register.
                List<Clause> clauses = current.getClauses();
                if (clauses != null) {
                    for (Clause clause : clauses) {
                        type.addMagic(new Magic(type, priority, clause));
                    }
                }
                current = null;
            }
        }
    }

    /** Appends character data to the comment buffer while one is open. */
    @Override
    public void characters(char[] ch, int start, int length) {
        if (characters != null) {
            characters.append(ch, start, length);
        }
    }

    /** Record for the magic or match element currently being read. */
    private ClauseRecord current = new ClauseRecord(null);

    /**
     * Node of the tree of nested match elements. Each record remembers the
     * record that was current when it was created (its parent) and
     * collects the finished clauses of its children.
     */
    private class ClauseRecord {

        /** Record that was current when this one was created. */
        private ClauseRecord parent;

        /** Clause of this element; null for the record of a magic element. */
        private Clause clause;

        /** Finished clauses of the child elements; null until one exists. */
        private List<Clause> subclauses = null;

        public ClauseRecord(Clause clause) {
            this.parent = current;
            this.clause = clause;
        }

        /**
         * Finishes this record: combines its own clause with those of its
         * children, hands the result to the parent and makes the parent
         * the current record again.
         */
        public void stop() {
            if (subclauses != null) {
                // Children are alternatives: a single one is used as is,
                // several are OR-ed, and the result is AND-ed with this
                // record's own clause.
                Clause subclause;
                if (subclauses.size() == 1) {
                    subclause = subclauses.get(0);
                } else {
                    subclause = new OrClause(subclauses);
                }
                clause = new AndClause(clause, subclause);
            }
            if (parent.subclauses == null) {
                parent.subclauses = Collections.singletonList(clause);
            } else {
                if (parent.subclauses.size() == 1) {
                    // The singleton list is immutable; switch to a
                    // growable list before adding the second clause.
                    parent.subclauses = new ArrayList<Clause>(parent.subclauses);
                }
                parent.subclauses.add(clause);
            }
            current = current.parent;
        }

        /**
         * Returns the clauses collected from child elements.
         *
         * @return collected clauses, or null if there were none
         */
        public List<Clause> getClauses() {
            return subclauses;
        }
    }
}
这里的关键方法是void read(InputStream stream),调用SAXParser的parse方法执行事件处理,解析tika-mimetypes.xml文件,初始化MimeTypes types成员变量
接下来我们来看MimeTypesFactory类的源码:
/** * Creates instances of MimeTypes. */ public class MimeTypesFactory { /** * Creates an empty instance; same as calling new MimeTypes(). * * @return an empty instance */ public static MimeTypes create() { return new MimeTypes(); } /** * Creates and returns a MimeTypes instance from the specified document. * @throws MimeTypeException if the type configuration is invalid */ public static MimeTypes create(Document document) throws MimeTypeException { MimeTypes mimeTypes = new MimeTypes(); new MimeTypesReader(mimeTypes).read(document); mimeTypes.init(); return mimeTypes; } /** * Creates and returns a MimeTypes instance from the specified input stream. * Does not close the input stream(s). * @throws IOException if the stream can not be read * @throws MimeTypeException if the type configuration is invalid */ public static MimeTypes create(InputStream... inputStreams) throws IOException, MimeTypeException { MimeTypes mimeTypes = new MimeTypes(); MimeTypesReader reader = new MimeTypesReader(mimeTypes); for(InputStream inputStream : inputStreams) { reader.read(inputStream); } mimeTypes.init(); return mimeTypes; } /** @see #create(InputStream...) */ public static MimeTypes create(InputStream stream) throws IOException, MimeTypeException { return create(new InputStream[] { stream }); } /** * Creates and returns a MimeTypes instance from the resource * at the location specified by the URL. Opens and closes the * InputStream from the URL. * If multiple URLs are supplied, then they are loaded in turn. * * @throws IOException if the URL can not be accessed * @throws MimeTypeException if the type configuration is invalid */ public static MimeTypes create(URL... urls) throws IOException, MimeTypeException { InputStream[] streams = new InputStream[urls.length]; for(int i=0; i<streams.length; i++) { streams[i] = urls[i].openStream(); } try { return create(streams); } finally { for(InputStream stream : streams) { stream.close(); } } } /** @see #create(URL...) 
*/ public static MimeTypes create(URL url) throws IOException, MimeTypeException { return create(new URL[] { url }); } /** * Creates and returns a MimeTypes instance from the specified file path, * as interpreted by the class loader in getResource(). * * @throws IOException if the file can not be accessed * @throws MimeTypeException if the type configuration is invalid */ public static MimeTypes create(String filePath) throws IOException, MimeTypeException { return create(MimeTypesReader.class.getResource(filePath)); } /** * Creates and returns a MimeTypes instance. The core mimetypes * will be loaded from the specified file path, and any custom * override mimetypes found will loaded afterwards. * The file paths will be interpreted by the class loader in * getResource(). * * @param coreFilePath The main MimeTypes file to load * @param extensionFilePath The name of extension MimeType files to load afterwards * * @throws IOException if the file can not be accessed * @throws MimeTypeException if the type configuration is invalid */ public static MimeTypes create(String coreFilePath, String extensionFilePath) throws IOException, MimeTypeException { // This allows us to replicate class.getResource() when using // the classloader directly String classPrefix = MimeTypesReader.class.getPackage().getName().replace('.', '/') + "/"; ClassLoader cl = MimeTypesReader.class.getClassLoader(); // Get the core URL, and all the extensions URLs URL coreURL = cl.getResource(classPrefix+coreFilePath); List<URL> extensionURLs = Collections.list( cl.getResources(classPrefix+extensionFilePath)); // Swap that into an Array, and process List<URL> urls = new ArrayList<URL>(); urls.add(coreURL); urls.addAll(extensionURLs); return create( urls.toArray(new URL[urls.size()]) ); } }
该类是MimeTypes工厂类,用来获取tika-mimetypes.xml文件提供的mime类型
接下来分析这个MimeTypes类的源码,该类实现了Detector接口,用于检测InputStream的mime类型
/**
 * Registry of {@link MimeType} definitions that also acts as a
 * {@link Detector}: it maps a document to a media type using magic byte
 * prefixes, XML root elements, file name patterns and a declared content
 * type, in that order of application (see
 * {@link #detect(InputStream, Metadata)}).
 * <p>
 * NOTE(review): only {@link #setSuperType}, {@code addAlias}, the creation
 * branch of {@link #forName(String)} and {@link #getDefaultMimeTypes()} are
 * synchronized; the other readers and writers of the internal collections
 * are not.
 */
public final class MimeTypes implements Detector, Serializable {

    /**
     * Serial version UID.
     */
    private static final long serialVersionUID = -1350863170146349036L;

    /**
     * Name of the {@link #rootMimeType root} type, application/octet-stream.
     */
    public static final String OCTET_STREAM = "application/octet-stream";

    /**
     * Name of the {@link #textMimeType text} type, text/plain.
     */
    public static final String PLAIN_TEXT = "text/plain";

    /**
     * Name of the {@link #xmlMimeType xml} type, application/xml.
     */
    public static final String XML = "application/xml";

    /**
     * Root type, application/octet-stream.
     */
    private final MimeType rootMimeType;

    /**
     * Text type, text/plain.
     */
    private final MimeType textMimeType;

    /**
     * xml type, application/xml
     */
    private final MimeType xmlMimeType;

    /**
     * Registered media types and their aliases.
     */
    private final MediaTypeRegistry registry = new MediaTypeRegistry();

    /** All the registered MimeTypes indexed on their canonical names */
    private final Map<MediaType, MimeType> types =
        new HashMap<MediaType, MimeType>();

    /** The patterns matcher */
    private Patterns patterns = new Patterns(registry);

    /** List of all registered magics; sorted by {@link #init()} */
    private final List<Magic> magics = new ArrayList<Magic>();

    /** List of all types with a rootXML rule; sorted by {@link #init()} */
    private final List<MimeType> xmls = new ArrayList<MimeType>();

    /**
     * Creates a registry that contains only the three built-in types
     * application/octet-stream, text/plain and application/xml.
     */
    public MimeTypes() {
        rootMimeType = new MimeType(MediaType.OCTET_STREAM);
        textMimeType = new MimeType(MediaType.TEXT_PLAIN);
        xmlMimeType = new MimeType(MediaType.APPLICATION_XML);

        add(rootMimeType);
        add(textMimeType);
        add(xmlMimeType);
    }

    /**
     * Find the Mime Content Type of a document from its name.
     * The name is matched against the registered patterns first as given
     * and then lower-cased (English locale).
     * Returns application/octet-stream if no better match is found.
     *
     * @deprecated Use {@link Tika#detect(String)} instead
     * @param name of the document to analyze.
     * @return the Mime Content Type of the specified document name
     */
    public MimeType getMimeType(String name) {
        MimeType type = patterns.matches(name);
        if (type != null) {
            return type;
        }
        type = patterns.matches(name.toLowerCase(Locale.ENGLISH));
        if (type != null) {
            return type;
        } else {
            return rootMimeType;
        }
    }

    /**
     * Find the Mime Content Type of a document stored in the given file.
     * Detection is delegated to a new {@link Tika} facade built on this
     * registry; the detected name is then looked up with
     * {@link #forName(String)}.
     *
     * @deprecated Use {@link Tika#detect(File)} instead
     * @param file file to analyze
     * @return the Mime Content Type of the specified document
     * @throws MimeTypeException if the type can't be detected
     * @throws IOException if the file can't be read
     */
    public MimeType getMimeType(File file)
            throws MimeTypeException, IOException {
        return forName(new Tika(this).detect(file));
    }

    /**
     * Returns the MIME type that best matches the given first few bytes
     * of a document stream. Returns application/octet-stream if no better
     * match is found.
     * <p>
     * The given byte array is expected to be at least {@link #getMinLength()}
     * long, or shorter only if the document stream itself is shorter.
     *
     * @param data first few bytes of a document stream
     * @return matching MIME type
     * @throws IllegalArgumentException if data is null
     */
    private MimeType getMimeType(byte[] data) {
        if (data == null) {
            throw new IllegalArgumentException("Data is missing");
        } else if (data.length == 0) {
            // See https://issues.apache.org/jira/browse/TIKA-483
            return rootMimeType;
        }

        // Then, check for magic bytes. The first magic in list order that
        // matches wins, so the ordering established by init() matters.
        MimeType result = null;
        for (Magic magic : magics) {
            if (magic.eval(data)) {
                result = magic.getType();
                break;
            }
        }

        if (result != null) {
            // When detecting generic XML (or possibly XHTML),
            // extract the root element and match it against known types
            if ("application/xml".equals(result.getName())
                    || "text/html".equals(result.getName())) {
                XmlRootExtractor extractor = new XmlRootExtractor();

                QName rootElement = extractor.extractRootElement(data);
                if (rootElement != null) {
                    // First type in xmls whose rootXML rule matches wins;
                    // if none matches, the magic result is kept.
                    for (MimeType type : xmls) {
                        if (type.matchesXML(
                                rootElement.getNamespaceURI(),
                                rootElement.getLocalPart())) {
                            result = type;
                            break;
                        }
                    }
                } else if ("application/xml".equals(result.getName())) {
                    // Downgrade from application/xml to text/plain since
                    // the document seems not to be well-formed.
                    result = textMimeType;
                }
            }
            return result;
        }

        // Finally, assume plain text if no control bytes are found
        try {
            TextDetector detector = new TextDetector(getMinLength());
            ByteArrayInputStream stream = new ByteArrayInputStream(data);
            return forName(detector.detect(stream, new Metadata()).toString());
        } catch (Exception e) {
            // Any failure of the text fallback yields the root type.
            return rootMimeType;
        }
    }

    /**
     * Reads the first {@link #getMinLength()} bytes from the given stream.
     * If the stream is shorter, then the entire content of the stream is
     * returned.
     * <p>
     * The given stream is never {@link InputStream#close() closed},
     * {@link InputStream#mark(int) marked}, or
     * {@link InputStream#reset() reset} by this method.
     *
     * @param stream stream to be read
     * @return first {@link #getMinLength()} (or fewer) bytes of the stream
     * @throws IOException if the stream can not be read
     * @throws IllegalArgumentException if stream is null
     */
    private byte[] readMagicHeader(InputStream stream) throws IOException {
        if (stream == null) {
            throw new IllegalArgumentException("InputStream is missing");
        }

        byte[] bytes = new byte[getMinLength()];
        int totalRead = 0;

        // A single read() may return fewer bytes than requested, so keep
        // reading until the buffer is full or the stream ends.
        int lastRead = stream.read(bytes);
        while (lastRead != -1) {
            totalRead += lastRead;
            if (totalRead == bytes.length) {
                return bytes;
            }
            lastRead = stream.read(bytes, totalRead, bytes.length - totalRead);
        }

        // Stream ended early: return a copy trimmed to what was read.
        byte[] shorter = new byte[totalRead];
        System.arraycopy(bytes, 0, shorter, 0, totalRead);
        return shorter;
    }

    /**
     * Returns the registered media type with the given name (or alias).
     * The named media type is automatically registered (and returned) if
     * it doesn't already exist.
     *
     * @param name media type name (case-insensitive)
     * @return the registered media type with the given name or alias
     * @throws MimeTypeException if the given media type name is invalid
     */
    public MimeType forName(String name) throws MimeTypeException {
        MediaType type = MediaType.parse(name);
        if (type != null) {
            MediaType normalisedType = registry.normalize(type);
            // NOTE(review): this first lookup happens without holding the
            // lock that guards the insertion below.
            MimeType mime = types.get(normalisedType);

            if (mime == null) {
                synchronized (this) {
                    // Double check it didn't already get added while
                    // we were waiting for the lock
                    mime = types.get(normalisedType);
                    if (mime == null) {
                        mime = new MimeType(type);
                        add(mime);
                        // NOTE(review): add() already stores the type under
                        // mime.getType(); this put looks redundant if that
                        // equals the parsed type - confirm against MimeType.
                        types.put(type, mime);
                    }
                }
            }
            return mime;
        } else {
            throw new MimeTypeException("Invalid media type name: " + name);
        }
    }

    /**
     * Records the given parent as the supertype of the given type in the
     * media type registry.
     *
     * @param type media type
     * @param parent supertype of the media type
     */
    public synchronized void setSuperType(MimeType type, MediaType parent) {
        registry.addSuperType(type.getType(), parent);
    }

    /**
     * Adds an alias for the given media type. This method should only
     * be called from {@link MimeType#addAlias(String)}.
     *
     * @param type media type
     * @param alias media type alias (normalized to lower case)
     */
    synchronized void addAlias(MimeType type, MediaType alias) {
        registry.addAlias(type.getType(), alias);
    }

    /**
     * Adds a file name pattern for the given media type. Assumes that the
     * pattern being added is <b>not</b> a JDK standard regular expression.
     *
     * @param type
     *            media type
     * @param pattern
     *            file name pattern
     * @throws MimeTypeException
     *             if the pattern conflicts with existing ones
     */
    public void addPattern(MimeType type, String pattern)
            throws MimeTypeException {
        this.addPattern(type, pattern, false);
    }

    /**
     * Adds a file name pattern for the given media type. The caller can specify
     * whether the pattern being added <b>is</b> or <b>is not</b> a JDK standard
     * regular expression via the <code>isRegex</code> parameter. If the value
     * is set to true, then a JDK standard regex is assumed, otherwise the
     * freedesktop glob type is assumed.
     *
     * @param type
     *            media type
     * @param pattern
     *            file name pattern
     * @param isRegex
     *            set to true if JDK std regexs are desired, otherwise set to
     *            false.
     * @throws MimeTypeException
     *             if the pattern conflicts with existing ones.
     *
     */
    public void addPattern(MimeType type, String pattern, boolean isRegex)
            throws MimeTypeException {
        patterns.add(pattern, isRegex, type);
    }

    /**
     * Returns the media type registry backing this instance. The internal
     * registry object itself is returned, not a copy.
     *
     * @return media type registry
     */
    public MediaTypeRegistry getMediaTypeRegistry() {
        return registry;
    }

    /**
     * Return the minimum length of data to provide to analyzing methods based
     * on the document's content in order to check all the known MimeTypes.
     *
     * @return the minimum length of data to provide (currently 64 KiB).
     * @see #getMimeType(byte[])
     */
    public int getMinLength() {
        // This needs to be reasonably large to be able to correctly detect
        // things like XML root elements after initial comment and DTDs
        return 64 * 1024;
    }

    /**
     * Add the specified mime-type in the repository.
     *
     * @param type
     *            is the mime-type to add.
     */
    void add(MimeType type) {
        registry.addType(type.getType());
        types.put(type.getType(), type);

        // Update the magics index...
        if (type.hasMagic()) {
            magics.addAll(type.getMagics());
        }

        // Update the xml (xmlRoot) index...
        if (type.hasRootXML()) {
            xmls.add(type);
        }
    }

    /**
     * Called after all configured types have been loaded.
     * Collects the magics and root-XML types of every registered type into
     * the magics and xmls lists and sorts both lists.
     * <p>
     * NOTE(review): the lists are not cleared first, so entries already
     * indexed by {@link #add(MimeType)} would be appended a second time.
     */
    void init() {
        for (MimeType type : types.values()) {
            magics.addAll(type.getMagics());
            if (type.hasRootXML()) {
                xmls.add(type);
            }
        }
        Collections.sort(magics);
        Collections.sort(xmls);
    }

    /**
     * Automatically detects the MIME type of a document based on magic
     * markers in the stream prefix and any given metadata hints.
     * <p>
     * The given stream is expected to support marks, so that this method
     * can reset the stream to the position it was in before this method
     * was called.
     * <p>
     * The magic-based type is refined by the resource name hint and then
     * by the content type hint; a hint only replaces the current result
     * when the registry reports it as a specialization of that result.
     * <p>
     * NOTE(review): metadata is dereferenced unconditionally, so a null
     * metadata argument results in a NullPointerException.
     *
     * @param input document stream, or <code>null</code>
     * @param metadata metadata hints
     * @return MIME type of the document
     * @throws IOException if the document stream could not be read
     */
    public MediaType detect(InputStream input, Metadata metadata)
            throws IOException {
        MediaType type = MediaType.OCTET_STREAM;

        // Get type based on magic prefix
        if (input != null) {
            input.mark(getMinLength());
            try {
                byte[] prefix = readMagicHeader(input);
                type = getMimeType(prefix).getType();
            } finally {
                // Rewind so the caller can read the stream from the start.
                input.reset();
            }
        }

        // Get type based on resourceName hint (if available)
        String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
        if (resourceName != null) {
            String name = null;

            // Deal with a URI or a path name in as the resource name
            try {
                URI uri = new URI(resourceName);
                String path = uri.getPath();
                if (path != null) {
                    // Keep only the last path segment; a path that ends in
                    // '/' (or is empty) leaves name null.
                    int slash = path.lastIndexOf('/');
                    if (slash + 1 < path.length()) {
                        name = path.substring(slash + 1);
                    }
                }
            } catch (URISyntaxException e) {
                // Not a valid URI: use the raw resource name as given.
                name = resourceName;
            }

            if (name != null) {
                MediaType hint = getMimeType(name).getType();
                if (registry.isSpecializationOf(hint, type)) {
                    type = hint;
                }
            }
        }

        // Get type based on metadata hint (if available)
        String typeName = metadata.get(Metadata.CONTENT_TYPE);
        if (typeName != null) {
            try {
                MediaType hint = forName(typeName).getType();
                if (registry.isSpecializationOf(hint, type)) {
                    type = hint;
                }
            } catch (MimeTypeException e) {
                // Malformed type name, ignore
            }
        }

        return type;
    }

    /** Lazily created default instance; see {@link #getDefaultMimeTypes()}. */
    private static MimeTypes DEFAULT_TYPES = null;

    /**
     * Get the default MimeTypes. This includes all the build in
     * media types, and any custom override ones present.
     * The instance is created on the first call from the resources
     * tika-mimetypes.xml and custom-mimetypes.xml and reused afterwards.
     *
     * @return MimeTypes default type registry
     * @throws RuntimeException if the default registry can not be read or
     *         parsed
     */
    public static synchronized MimeTypes getDefaultMimeTypes() {
        if (DEFAULT_TYPES == null) {
            try {
                DEFAULT_TYPES = MimeTypesFactory.create(
                        "tika-mimetypes.xml", "custom-mimetypes.xml");
            } catch (MimeTypeException e) {
                throw new RuntimeException(
                        "Unable to parse the default media type registry", e);
            } catch (IOException e) {
                throw new RuntimeException(
                        "Unable to read the default media type registry", e);
            }
        }
        return DEFAULT_TYPES;
    }

}
该类重要的方法是static synchronized MimeTypes getDefaultMimeTypes()方法和MediaType detect(InputStream input, Metadata metadata)方法
前者是获取MimeTypes类型实例,后者是检测mime类型
私有方法
MimeType getMimeType(byte[] data)
byte[] readMagicHeader(InputStream stream)
供MediaType detect(InputStream input, Metadata metadata)内部调用
其他方法如synchronized void setSuperType(MimeType type, MediaType parent)
synchronized void addAlias(MimeType type, MediaType alias)
void addPattern(MimeType type, String pattern, boolean isRegex)
供tika-mimetypes.xml文件解析事件处理类MimeTypesReader的相关方法回调执行
MimeTypes对象持有final MediaTypeRegistry registry = new MediaTypeRegistry()成员变量,在MimeTypesReader对象执行事件处理方法时通过MimeTypes对象实例的相关方法执行对该成员变量的初始化,最后完成对tika-mimetypes.xml文件中提供的mime类型的注册,下面分析它的源码:
/**
 * Registry of known Internet media types, their aliases and their
 * supertype relationships.
 */
public class MediaTypeRegistry implements Serializable {

    /** Serial version UID */
    private static final long serialVersionUID = 4710974869988895410L;

    /**
     * Returns the registry that backs the default {@link MimeTypes}
     * instance.
     *
     * @since Apache Tika 0.8
     * @return default media type registry
     */
    public static MediaTypeRegistry getDefaultRegistry() {
        MimeTypes defaults = MimeTypes.getDefaultMimeTypes();
        return defaults.getMediaTypeRegistry();
    }

    /**
     * Known media types and aliases. A canonical type maps to itself; an
     * alias maps to the canonical type it stands for.
     */
    private final Map<MediaType, MediaType> registry =
        new HashMap<MediaType, MediaType>();

    /**
     * Explicit inheritance rules: each key maps to its closest supertype.
     */
    private final Map<MediaType, MediaType> inheritance =
        new HashMap<MediaType, MediaType>();

    /**
     * Returns all canonical media types in sorted order. Aliases are not
     * part of the result.
     *
     * @since Apache Tika 0.8
     * @return canonical media types
     */
    public SortedSet<MediaType> getTypes() {
        SortedSet<MediaType> canonicalTypes = new TreeSet<MediaType>();
        canonicalTypes.addAll(registry.values());
        return canonicalTypes;
    }

    /**
     * Returns, in sorted order, every registered alias that maps to the
     * given canonical type. The type itself is never included.
     *
     * @since Apache Tika 0.8
     * @param type canonical media type
     * @return known aliases
     */
    public SortedSet<MediaType> getAliases(MediaType type) {
        SortedSet<MediaType> found = new TreeSet<MediaType>();
        for (Map.Entry<MediaType, MediaType> mapping : registry.entrySet()) {
            if (!mapping.getValue().equals(type)) {
                continue; // maps to some other canonical type
            }
            if (mapping.getKey().equals(type)) {
                continue; // identity mapping of the canonical type
            }
            found.add(mapping.getKey());
        }
        return found;
    }

    /**
     * Registers a canonical media type.
     *
     * @param type canonical media type
     */
    public void addType(MediaType type) {
        registry.put(type, type);
    }

    /**
     * Registers an alias for a canonical media type.
     *
     * @param type canonical media type
     * @param alias alternative name that should resolve to the type
     */
    public void addAlias(MediaType type, MediaType alias) {
        registry.put(alias, type);
    }

    /**
     * Records an explicit supertype for a media type, replacing any
     * supertype recorded for it earlier.
     *
     * @param type media type
     * @param supertype closest supertype of the media type
     */
    public void addSuperType(MediaType type, MediaType supertype) {
        inheritance.put(type, supertype);
    }

    /**
     * Resolves a media type to its canonical form. The lookup uses the
     * base type; parameters of the given type are carried over to the
     * result. A type whose base type is not registered is returned
     * unchanged.
     *
     * @param type media type, or <code>null</code>
     * @return canonical media type, or <code>null</code> for a
     *         <code>null</code> argument
     */
    public MediaType normalize(MediaType type) {
        if (type == null) {
            return null;
        }
        MediaType canonical = registry.get(type.getBaseType());
        if (canonical == null) {
            return type;
        }
        if (type.hasParameters()) {
            return new MediaType(canonical, type.getParameters());
        }
        return canonical;
    }

    /**
     * Checks whether media type a is a strict specialization of media
     * type b, i.e. whether b is reached by following the supertype chain
     * upwards from a. Both types should be already normalised.
     *
     * @since Apache Tika 0.8
     * @param a media type, normalised
     * @param b suspected supertype, normalised
     * @return <code>true</code> if b is a supertype of a,
     *         <code>false</code> otherwise
     */
    public boolean isSpecializationOf(MediaType a, MediaType b) {
        MediaType parent = getSupertype(a);
        return isInstanceOf(parent, b);
    }

    /**
     * Checks whether media type a equals base type b or is a
     * specialization of it. Both types should be already normalised.
     *
     * @since Apache Tika 1.2
     * @param a media type, normalised
     * @param b base type, normalised
     * @return <code>true</code> if a is non-null and either equals b or
     *         is a specialization of b, <code>false</code> otherwise
     */
    public boolean isInstanceOf(MediaType a, MediaType b) {
        if (a == null) {
            return false;
        }
        if (a.equals(b)) {
            return true;
        }
        return isSpecializationOf(a, b);
    }

    /**
     * Parses and normalises the given media type string and checks whether
     * the result equals base type b or is a specialization of it.
     * The given base type should already be normalised.
     *
     * @since Apache Tika 1.2
     * @param a media type name
     * @param b base type, normalised
     * @return <code>true</code> if the parsed type equals b or is a
     *         specialization of b, <code>false</code> otherwise
     */
    public boolean isInstanceOf(String a, MediaType b) {
        MediaType parsed = MediaType.parse(a);
        return isInstanceOf(normalize(parsed), b);
    }

    /**
     * Returns the supertype of the given type. The rules are tried in
     * this order:
     * <ol>
     *   <li>a type with parameters yields its base type,</li>
     *   <li>an explicit inheritance rule yields the recorded supertype,</li>
     *   <li>a subtype ending in "+xml" yields application/xml,</li>
     *   <li>a subtype ending in "+zip" yields application/zip,</li>
     *   <li>any text/... type other than text/plain yields text/plain,</li>
     *   <li>anything else except application/octet-stream yields
     *       application/octet-stream.</li>
     * </ol>
     *
     * @since Apache Tika 0.8
     * @param type media type
     * @return supertype, or <code>null</code> for application/octet-stream
     *         and for a <code>null</code> argument
     */
    public MediaType getSupertype(MediaType type) {
        if (type == null) {
            return null;
        }
        if (type.hasParameters()) {
            return type.getBaseType();
        }
        if (inheritance.containsKey(type)) {
            return inheritance.get(type);
        }
        if (type.getSubtype().endsWith("+xml")) {
            return MediaType.APPLICATION_XML;
        }
        if (type.getSubtype().endsWith("+zip")) {
            return MediaType.APPLICATION_ZIP;
        }
        boolean isTextFamily = "text".equals(type.getType());
        if (isTextFamily && !MediaType.TEXT_PLAIN.equals(type)) {
            return MediaType.TEXT_PLAIN;
        }
        if (MediaType.OCTET_STREAM.equals(type)) {
            // The root of the hierarchy has no supertype.
            return null;
        }
        return MediaType.OCTET_STREAM;
    }

}
该类完成对系统提供的mime类型的注册,并提供对给定(例如第三方)mime类型的检测,即通过别名归一化与继承关系判断系统是否对该类型(MediaType)提供支持