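有这样一段HTML(原文中的示例标记在提取时丢失,以下为与下文XPath及输出相符的最小示例):
<html>
<body>
<div>
<table>
<tr><td id="foo">Hello</td></tr>
</table>
</div>
</body>
</html>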
希望通过这个XPath提取出Hello:
//div//td[contains(@id, 'foo')]/text()
先导入maven依赖:
<dependency>
    <groupId>net.sourceforge.htmlcleaner</groupId>
    <artifactId>htmlcleaner</artifactId>
    <version>2.21</version>
</dependency>
main函数:
package com.my.demo;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.w3c.dom.Document;
public class HtmlXpathJava {
public static void main(String[] args) {
String sampleHtml = "
String sampleXpath = "//div//td[contains(@id, 'foo')]/text()";
System.out.println(getValueByXpath(sampleXpath, sampleHtml));
}
/**
* Extract value by xPath from HTML.
*/
private static String getValueByXpath(String xPath, String html) {
TagNode tagNode = new HtmlCleaner().clean(html);
String value = null;
try {
Document doc = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
XPath xpath = XPathFactory.newInstance().newXPath();
value = (String) xpath.evaluate(xPath, doc, XPathConstants.STRING);
} catch (Exception e) {
System.out.println("Extract value error. " + e.getMessage());
e.printStackTrace();
}
return value;
}
}
输出:
Hello
参考: