FYI: Eclipse Luna (4.4) is currently in beta.
Create a new Maven project.
Open the pom.xml file.
Add the following dependencies:
<dependency>
<groupId>net.sourceforge.htmlcleaner</groupId>
<artifactId>htmlcleaner</artifactId>
<version>2.6</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.3.4</version>
</dependency>
<dependency>
<groupId>net.sourceforge.htmlcleaner</groupId>
<artifactId>htmlcleaner</artifactId>
<version>2.6</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.3.4</version>
</dependency>
The code is as follows:
package sty.qainjava.xpath.on.html;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.nio.charset.Charset;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.SimpleHtmlSerializer;
import org.htmlcleaner.TagNode;
import org.w3c.dom.Document;
/**
* QAinJava: how to do an xpath on html in java.
*
* We use <a href="http://htmlcleaner.sourceforge.net/">HtmlCleaner</a>
* and <a href="https://hc.apache.org/">HttpClient</a>.
*
* @author Mihail STY
*/
public class Program {
/**
* We're not using any methods so that the source code is as straight
* forward as possible.
*
* No exception handling at all for simplicity
*/
public static void main(String[] args) throws IOException,
ParserConfigurationException, XPathExpressionException,
TransformerException {
String address = "https://www.google.com/";
String html;
{
// the httpclient part
CloseableHttpClient httpclient = HttpClients.createDefault();
HttpGet httpGet = new HttpGet(address);
CloseableHttpResponse response = httpclient.execute(httpGet);
HttpEntity entity = response.getEntity();
ContentType contentType = ContentType.getOrDefault(entity);
Charset charset = contentType.getCharset();
BufferedReader r = new BufferedReader(new InputStreamReader(
entity.getContent(), charset));
// we can directly plug the input to HtmlCleaner,
// but we put it in a string so we can print it,
// or save it to a file
String line = null;
StringBuilder builder = new StringBuilder();
while ((line = r.readLine()) != null) {
builder.append(line);
}
html = builder.toString();
}
{// write html to a file
BufferedWriter bf = new BufferedWriter(new OutputStreamWriter(
new FileOutputStream("google.html.xml")));
bf.write(html);
// exception handling is not exceptionally good, but that's not our
// focus here
bf.flush();
bf.close();
}
// HtmlCleaner part
TagNode tagNode = new HtmlCleaner().clean(html);
String cleanHtml = new SimpleHtmlSerializer(new CleanerProperties())
.getAsString(tagNode);
// System.out.println(cleanHtml);
{// write cleanHtml to a file
BufferedWriter bf = new BufferedWriter(new OutputStreamWriter(
new FileOutputStream("clean.html.xml")));
bf.write(cleanHtml);
// exception handling is not exceptionally good, but that's not our
// focus here
bf.flush();
bf.close();
}
// we need a DOM document to execute xpath, HtmlCleaner helps in creating one
Document doc = new DomSerializer(new CleanerProperties())
.createDOM(tagNode);
{// save dom to file with a transformer (just for testing purposes)
TransformerFactory factory = TransformerFactory.newInstance();
Transformer transformer = factory.newTransformer();
transformer.transform(new DOMSource(doc), new StreamResult(
new File("dom.html.xml")));
}
// the xpath part
XPath xpath = XPathFactory.newInstance().newXPath();
String imgURL = (String) xpath.evaluate("//img/@src", doc,
XPathConstants.STRING);
//using two URLs we can make sure we get the absolute URL even if relative.
System.out.println(new URL(new URL(address), imgURL).toString());
}
} |
package sty.qainjava.xpath.on.html;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.nio.charset.Charset;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.SimpleHtmlSerializer;
import org.htmlcleaner.TagNode;
import org.w3c.dom.Document;
/**
 * QAinJava: how to do an xpath on html in java.
 *
 * We use <a href="http://htmlcleaner.sourceforge.net/">HtmlCleaner</a>
 * and <a href="https://hc.apache.org/">HttpClient</a>.
 *
 * @author Mihail STY
 */
public class Program {
    /**
     * Downloads a page, cleans it with HtmlCleaner, converts it to a DOM
     * document and prints the absolute URL of the first image found with an
     * XPath expression.
     *
     * Along the way three files are written to the working directory for
     * inspection: the raw page ("google.html.xml"), the cleaned page
     * ("clean.html.xml") and the serialized DOM ("dom.html.xml").
     *
     * We're not using any helper methods so that the source code is as
     * straight forward as possible. Exceptions are simply propagated, but
     * every resource is closed through try-with-resources.
     *
     * @param args command line arguments (not used)
     * @throws IOException if the download fails, the response has no body,
     *         or one of the files cannot be written
     * @throws ParserConfigurationException if the DOM document cannot be
     *         created
     * @throws XPathExpressionException if the XPath expression cannot be
     *         evaluated
     * @throws TransformerException if the DOM cannot be serialized to a file
     */
    public static void main(String[] args) throws IOException,
            ParserConfigurationException, XPathExpressionException,
            TransformerException {
        String address = "https://www.google.com/";
        // used as the fallback for decoding and for every file we write,
        // so the output does not depend on the platform default encoding
        Charset utf8 = Charset.forName("UTF-8");

        // we could directly plug the input to HtmlCleaner,
        // but we put it in a string so we can print it,
        // or save it to a file
        StringBuilder builder = new StringBuilder();
        // the httpclient part; client and response are closed automatically
        try (CloseableHttpClient httpclient = HttpClients.createDefault();
                CloseableHttpResponse response = httpclient
                        .execute(new HttpGet(address))) {
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                throw new IOException("No response body received from "
                        + address);
            }
            // getCharset() is null when the Content-Type header carries no
            // charset parameter; InputStreamReader rejects a null charset
            Charset charset = ContentType.getOrDefault(entity).getCharset();
            if (charset == null) {
                charset = utf8;
            }
            try (BufferedReader r = new BufferedReader(new InputStreamReader(
                    entity.getContent(), charset))) {
                String line;
                while ((line = r.readLine()) != null) {
                    // readLine() strips the line terminator, put one back so
                    // that lines of the page are not glued together
                    builder.append(line).append('\n');
                }
            }
        }
        String html = builder.toString();

        // write html to a file
        try (BufferedWriter bf = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream("google.html.xml"), utf8))) {
            bf.write(html);
        }

        // HtmlCleaner part
        TagNode tagNode = new HtmlCleaner().clean(html);
        String cleanHtml = new SimpleHtmlSerializer(new CleanerProperties())
                .getAsString(tagNode);

        // write cleanHtml to a file
        try (BufferedWriter bf = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream("clean.html.xml"), utf8))) {
            bf.write(cleanHtml);
        }

        // we need a DOM document to execute xpath, HtmlCleaner helps in
        // creating one
        Document doc = new DomSerializer(new CleanerProperties())
                .createDOM(tagNode);

        // save dom to file with a transformer (just for testing purposes)
        Transformer transformer = TransformerFactory.newInstance()
                .newTransformer();
        transformer.transform(new DOMSource(doc), new StreamResult(
                new File("dom.html.xml")));

        // the xpath part: with XPathConstants.STRING we get the value of the
        // first matching attribute, or an empty string when nothing matches
        XPath xpath = XPathFactory.newInstance().newXPath();
        String imgURL = (String) xpath.evaluate("//img/@src", doc,
                XPathConstants.STRING);

        // using two URLs we can make sure we get the absolute URL even if
        // the src attribute is relative
        System.out.println(new URL(new URL(address), imgURL).toString());
    }
}