Execute XPath queries on HTML downloaded from a URL with Java using Eclipse Luna

FYI: Eclipse Luna (4.4) is currently in beta.

Create a new Maven project.

Open the pom.xml file.

Add the following dependencies:

    <dependency>
      <groupId>net.sourceforge.htmlcleaner</groupId>
      <artifactId>htmlcleaner</artifactId>
      <version>2.6</version>
    </dependency>
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.3.4</version>
    </dependency>

The code is as follows:

package sty.qainjava.xpath.on.html;
 
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.nio.charset.Charset;
 
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
 
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.SimpleHtmlSerializer;
import org.htmlcleaner.TagNode;
import org.w3c.dom.Document;
 
/**
 * QAinJava: how to do an xpath on html in java.
 * 
 * We use <a href="http://htmlcleaner.sourceforge.net/">HtmlCleaner</a>
 * and <a href="https://hc.apache.org/">HttpClient</a>.
 * 
 * @author Mihail STY
 */
public class Program {
	/**
	 * We're not using any methods so that the source code is as straight
	 * forward as possible.
	 * 
	 * No exception handling at all for simplicity
	 */
	public static void main(String[] args) throws IOException,
	        ParserConfigurationException, XPathExpressionException,
	        TransformerException {
 
		String address = "https://www.google.com/";
 
		String html;
 
 
		{
			// the httpclient part
			CloseableHttpClient httpclient = HttpClients.createDefault();
			HttpGet httpGet = new HttpGet(address);
			CloseableHttpResponse response = httpclient.execute(httpGet);
			HttpEntity entity = response.getEntity();
 
			ContentType contentType = ContentType.getOrDefault(entity);
			Charset charset = contentType.getCharset();
 
			BufferedReader r = new BufferedReader(new InputStreamReader(
			        entity.getContent(), charset));
 
			// we can directly plug the input to HtmlCleaner,
			// but we put it in a string so we can print it,
			// or save it to a file
			String line = null;
			StringBuilder builder = new StringBuilder();
			while ((line = r.readLine()) != null) {
				builder.append(line);
			}
			html = builder.toString();
		}
 
		{// write html to a file
			BufferedWriter bf = new BufferedWriter(new OutputStreamWriter(
			        new FileOutputStream("google.html.xml")));
			bf.write(html);
			// exception handling is not exceptionally good, but that's not our
			// focus here
			bf.flush();
			bf.close();
		}
 
		// HtmlCleaner part
		TagNode tagNode = new HtmlCleaner().clean(html);
		String cleanHtml = new SimpleHtmlSerializer(new CleanerProperties())
		        .getAsString(tagNode);
		// System.out.println(cleanHtml);
 
		{// write cleanHtml to a file
			BufferedWriter bf = new BufferedWriter(new OutputStreamWriter(
			        new FileOutputStream("clean.html.xml")));
			bf.write(cleanHtml);
			// exception handling is not exceptionally good, but that's not our
			// focus here
			bf.flush();
			bf.close();
		}
 
		// we need a DOM document to execute xpath, HtmlCleaner helps in creating one
		Document doc = new DomSerializer(new CleanerProperties())
		        .createDOM(tagNode);
 
		{// save dom to file with a transformer (just for testing purposes)
			TransformerFactory factory = TransformerFactory.newInstance();
			Transformer transformer = factory.newTransformer();
			transformer.transform(new DOMSource(doc), new StreamResult(
			        new File("dom.html.xml")));
		}
 
		// the xpath part
		XPath xpath = XPathFactory.newInstance().newXPath();
		String imgURL = (String) xpath.evaluate("//img/@src", doc,
		        XPathConstants.STRING);
 
		//using two URLs we can make sure we get the absolute URL even if relative.
		System.out.println(new URL(new URL(address), imgURL).toString());
	}
}

Leave a Reply

Your email address will not be published. Required fields are marked *

Notify me of followup comments via e-mail. You can also subscribe without commenting.

This site uses Akismet to reduce spam. Learn how your comment data is processed.