public class HtmlScraper {
    /**
     * Scrapes an HTML page for {@code <img>} tags.
     *
     * <p>The markup is run through {@link Html#fromHtml} with an image getter whose only
     * job is to record every image source it is asked for. No image is actually loaded.
     *
     * @param htmlText     the HTML markup to scan
     * @param originUrl    base URL prepended to image sources that carry no scheme;
     *                     when {@code null}, sources are recorded exactly as written
     * @param outImageUrls output list; every image URL found is appended to it
     * @return the scraped plain text, with leading and trailing whitespace trimmed
     */
    public static String parseWithImageTags(
            String htmlText,
            @Nullable String originUrl,
            List<String> outImageUrls) {
        ExtractImageGetter imageGetter = new ExtractImageGetter(originUrl, outImageUrls);
        String strippedText = Html.fromHtml(
                htmlText,
                imageGetter,
                null /* tagHandler */)
                .toString();
        return strippedText.trim();
    }

    /**
     * An {@link Html.ImageGetter} that collects image sources instead of loading images.
     */
    private static class ExtractImageGetter implements Html.ImageGetter {
        @Nullable private final String mOriginUrl;
        private final List<String> mSources;

        /**
         * @param originUrl  base URL used to absolutize scheme-less sources, or {@code null}
         * @param outSources list that receives every collected source; stored by reference
         */
        public ExtractImageGetter(@Nullable String originUrl, List<String> outSources) {
            mOriginUrl = originUrl;
            mSources = outSources;
        }

        /**
         * Records {@code source} (prefixed with the origin URL when it has no scheme)
         * and returns a transparent placeholder drawable.
         */
        @Override
        public Drawable getDrawable(String source) {
            // An <img> without a usable src gives us nothing to record; bail out before
            // Uri.parse() is handed a null string.
            if (TextUtils.isEmpty(source)) {
                return new ColorDrawable(Color.TRANSPARENT);
            }

            if (mOriginUrl != null && TextUtils.isEmpty(Uri.parse(source).getScheme())) {
                final boolean originEndsWithSlash = mOriginUrl.endsWith("/");
                final boolean sourceStartsWithSlash = source.startsWith("/");

                StringBuilder newSource = new StringBuilder(mOriginUrl);
                if (originEndsWithSlash && sourceStartsWithSlash) {
                    // Both sides bring a separator: drop one so the result has no "//".
                    newSource.append(source, 1, source.length());
                } else {
                    if (!originEndsWithSlash && !sourceStartsWithSlash) {
                        newSource.append('/');
                    }
                    newSource.append(source);
                }
                source = newSource.toString();
            }

            mSources.add(source);
            // Dummy drawable.
            return new ColorDrawable(Color.TRANSPARENT);
        }

        /** Returns the list that collected sources are appended to. */
        public List<String> getSources() {
            return mSources;
        }
    }
}
最后,图片链接会被提取到 outImageUrls 中。但是,这种做法的效率真的比正则匹配更好吗?
/**
 * Replays the element subtree the pull parser is currently positioned on as SAX
 * events on this driver's content handler.
 *
 * <p>The parser must be on a {@code START_TAG}. Events are read until the parser's
 * depth drops back to the depth of the enclosing element, or the document ends.
 *
 * @param pp pull parser positioned on the start tag of the subtree to replay
 * @throws SAXException if the parser is not on a start tag, or if a handler throws
 * @throws IOException  if the underlying parser fails to read its input
 */
public void parseSubTree(XmlPullParser pp) throws SAXException, IOException {
    this.pp = pp;
    final boolean namespaceAware = pp.getFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES);
    try {
        if (pp.getEventType() != XmlPullParser.START_TAG) {
            throw new SAXException(
                "start tag must be read before skiping subtree" + pp.getPositionDescription());
        }
        final int[] holderForStartAndLength = new int[2];
        // Scratch buffer reused to build "prefix:localName" qualified names.
        final StringBuilder rawName = new StringBuilder(16);
        String prefix = null;
        String name = null;
        // Depth of the parent element; the loop stops once the parser is back at it.
        int level = pp.getDepth() - 1;
        int type = XmlPullParser.START_TAG;
        LOOP:
        do {
            switch (type) {
                case XmlPullParser.START_TAG:
                    if (namespaceAware) {
                        // NOTE(review): the prefix-mapping range arithmetic is kept exactly
                        // as found; confirm it against XmlPullParser.getNamespaceCount().
                        final int depth = pp.getDepth() - 1;
                        final int countPrev =
                            (level > depth) ? pp.getNamespaceCount(depth) : 0;
                        final int count = pp.getNamespaceCount(depth + 1);
                        for (int i = countPrev; i < count; i++) {
                            contentHandler.startPrefixMapping(
                                pp.getNamespacePrefix(i),
                                pp.getNamespaceUri(i)
                            );
                        }
                        name = pp.getName();
                        prefix = pp.getPrefix();
                        if (prefix != null) {
                            rawName.setLength(0);
                            rawName.append(prefix);
                            rawName.append(':');
                            rawName.append(name);
                        }
                        // qName is the bare name when there is no prefix, else "prefix:name".
                        startElement(pp.getNamespace(),
                                     name,
                                     prefix == null ? name : rawName.toString());
                    } else {
                        startElement(pp.getNamespace(),
                                     pp.getName(),
                                     pp.getName());
                    }
                    break;
                case XmlPullParser.TEXT:
                    final char[] chars = pp.getTextCharacters(holderForStartAndLength);
                    contentHandler.characters(chars,
                                              holderForStartAndLength[0], // start
                                              holderForStartAndLength[1]  // len
                                             );
                    break;
                case XmlPullParser.END_TAG:
                    if (namespaceAware) {
                        name = pp.getName();
                        prefix = pp.getPrefix();
                        if (prefix != null) {
                            rawName.setLength(0);
                            rawName.append(prefix);
                            rawName.append(':');
                            rawName.append(name);
                        }
                        // Same qName rule as for START_TAG. The condition used to be
                        // inverted here, so prefixed elements reported the bare local
                        // name and unprefixed ones reported stale rawName contents.
                        contentHandler.endElement(pp.getNamespace(),
                                                  name,
                                                  prefix == null ? name : rawName.toString()
                                                 );
                        // when entering show prefixes for all levels!!!!
                        // NOTE(review): range arithmetic kept as found; verify it mirrors
                        // the mappings announced on the matching START_TAG.
                        final int depth = pp.getDepth();
                        final int countPrev =
                            (level > depth) ? pp.getNamespaceCount(pp.getDepth()) : 0;
                        int count = pp.getNamespaceCount(pp.getDepth() - 1);
                        // undeclare them in reverse order
                        for (int i = count - 1; i >= countPrev; i--) {
                            contentHandler.endPrefixMapping(
                                pp.getNamespacePrefix(i)
                            );
                        }
                    } else {
                        contentHandler.endElement(pp.getNamespace(),
                                                  pp.getName(),
                                                  pp.getName()
                                                 );
                    }
                    break;
                case XmlPullParser.END_DOCUMENT:
                    break LOOP;
            }
            type = pp.next();
        } while (pp.getDepth() > level);
    } catch (XmlPullParserException ex) {
        // Pull-parser failures are reported through the SAX error handler as fatal errors.
        final SAXParseException saxException = new SAXParseException("parsing error: " + ex, this, ex);
        ex.printStackTrace();
        errorHandler.fatalError(saxException);
    }
}
总之,这段代码的作用就是解析 XML。
/**
 * Dispatches an opening HTML tag to the handler that starts the matching span or
 * paragraph. Tag names are compared case-insensitively. Tags that are not recognised
 * here are passed to the custom tag handler when one is set.
 *
 * @param tag        name of the element being opened
 * @param attributes attributes of the element, consulted for font, anchor and image tags
 */
private void handleStartTag(String tag, Attributes attributes) {
    if (tag.equalsIgnoreCase("br")) {
        // Nothing to do on open: TagSoup ensures every <br> gets a matching close tag,
        // so the line break is emitted when that close tag is handled.
        return;
    }

    // Block-level containers that only need a paragraph break.
    if (tag.equalsIgnoreCase("p") || tag.equalsIgnoreCase("div")) {
        handleP(mSpannableStringBuilder);
        return;
    }

    if (tag.equalsIgnoreCase("strong") || tag.equalsIgnoreCase("b")) {
        start(mSpannableStringBuilder, new Bold());
        return;
    }

    if (tag.equalsIgnoreCase("em")
            || tag.equalsIgnoreCase("cite")
            || tag.equalsIgnoreCase("dfn")
            || tag.equalsIgnoreCase("i")) {
        start(mSpannableStringBuilder, new Italic());
        return;
    }

    if (tag.equalsIgnoreCase("big")) {
        start(mSpannableStringBuilder, new Big());
        return;
    }

    if (tag.equalsIgnoreCase("small")) {
        start(mSpannableStringBuilder, new Small());
        return;
    }

    if (tag.equalsIgnoreCase("font")) {
        startFont(mSpannableStringBuilder, attributes);
        return;
    }

    if (tag.equalsIgnoreCase("blockquote")) {
        handleP(mSpannableStringBuilder);
        start(mSpannableStringBuilder, new Blockquote());
        return;
    }

    if (tag.equalsIgnoreCase("tt")) {
        start(mSpannableStringBuilder, new Monospace());
        return;
    }

    if (tag.equalsIgnoreCase("a")) {
        startA(mSpannableStringBuilder, attributes);
        return;
    }

    if (tag.equalsIgnoreCase("u")) {
        start(mSpannableStringBuilder, new Underline());
        return;
    }

    if (tag.equalsIgnoreCase("sup")) {
        start(mSpannableStringBuilder, new Super());
        return;
    }

    if (tag.equalsIgnoreCase("sub")) {
        start(mSpannableStringBuilder, new Sub());
        return;
    }

    // Headings h1..h6 (either case of 'h'); the Header argument is the zero-based level.
    if (tag.length() == 2
            && Character.toLowerCase(tag.charAt(0)) == 'h'
            && tag.charAt(1) >= '1'
            && tag.charAt(1) <= '6') {
        handleP(mSpannableStringBuilder);
        start(mSpannableStringBuilder, new Header(tag.charAt(1) - '1'));
        return;
    }

    if (tag.equalsIgnoreCase("img")) {
        startImg(mSpannableStringBuilder, attributes, mImageGetter);
        return;
    }

    // Anything else is delegated to the custom handler, if there is one.
    if (mTagHandler != null) {
        mTagHandler.handleTag(true, tag, mSpannableStringBuilder, mReader);
    }
}
其实吧,如果让我来实现,我肯定想不到用这种方法。我大概会这么想:引入一个类似 jQuery 的解析库,
用 $("img").foreach({$0.src}) 这样的写法就能把链接取出来了。