Warning: WP_Syntax::substituteToken(): Argument #1 ($match) must be passed by reference, value given in /home/mstoynov/public_html_mihail.stoynov.com/wp-content/plugins/wp-syntax/wp-syntax.php on line 380
Warning: WP_Syntax::substituteToken(): Argument #1 ($match) must be passed by reference, value given in /home/mstoynov/public_html_mihail.stoynov.com/wp-content/plugins/wp-syntax/wp-syntax.php on line 380
FYI: Eclipse Luna (4.4) is currently beta.
Create a new Maven project.
Open the pom.xml file.
Add the following dependencies:
<dependency>
    <groupId>net.sourceforge.htmlcleaner</groupId>
    <artifactId>htmlcleaner</artifactId>
    <version>2.6</version>
</dependency>
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.3.4</version>
</dependency>
The code is as follows:
package sty.qainjava.xpath.on.html;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.SimpleHtmlSerializer;
import org.htmlcleaner.TagNode;
import org.w3c.dom.Document;

/**
 * QAinJava: how to do an xpath on html in java.
 *
 * We use <a href="http://htmlcleaner.sourceforge.net/">HtmlCleaner</a>
 * and <a href="https://hc.apache.org/">HttpClient</a>.
 *
 * @author Mihail STY
 */
public class Program {

    /**
     * Downloads a page, cleans it with HtmlCleaner, converts it to a DOM
     * document and evaluates an XPath expression against it.
     *
     * Everything is kept in this single method so that the source code is as
     * straightforward as possible. Exceptions are not handled, they simply
     * propagate to the caller; resources are nevertheless closed through
     * try-with-resources so nothing leaks when a step fails.
     *
     * Side effects: writes {@code google.html.xml}, {@code clean.html.xml}
     * and {@code dom.html.xml} into the working directory and prints the
     * absolute URL of the first image to standard output.
     *
     * @param args ignored
     * @throws IOException if the download fails, the response has no body,
     *         or one of the files cannot be written
     * @throws ParserConfigurationException if the DOM document cannot be created
     * @throws XPathExpressionException if the XPath expression cannot be evaluated
     * @throws TransformerException if the DOM document cannot be serialized
     */
    public static void main(String[] args) throws IOException,
            ParserConfigurationException, XPathExpressionException,
            TransformerException {
        String address = "https://www.google.com/";

        String html;
        // the httpclient part
        try (CloseableHttpClient httpclient = HttpClients.createDefault();
                CloseableHttpResponse response = httpclient.execute(new HttpGet(address))) {
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                throw new IOException("No response body received from " + address);
            }

            ContentType contentType = ContentType.getOrDefault(entity);
            Charset charset = contentType.getCharset();
            if (charset == null) {
                // The Content-Type header carried no charset parameter; fall
                // back to ISO-8859-1, the historical HTTP default for text.
                charset = StandardCharsets.ISO_8859_1;
            }

            // we can directly plug the input to HtmlCleaner,
            // but we put it in a string so we can print it,
            // or save it to a file
            StringBuilder builder = new StringBuilder();
            try (BufferedReader r = new BufferedReader(new InputStreamReader(
                    entity.getContent(), charset))) {
                // Copy raw characters instead of using readLine(), which
                // would strip the line terminators from the page.
                char[] buffer = new char[8192];
                int read;
                while ((read = r.read(buffer)) != -1) {
                    builder.append(buffer, 0, read);
                }
            }
            html = builder.toString();
        }

        // write html to a file (explicit encoding, so the result does not
        // depend on the platform default)
        try (BufferedWriter bf = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream("google.html.xml"), StandardCharsets.UTF_8))) {
            bf.write(html);
        }

        // HtmlCleaner part
        TagNode tagNode = new HtmlCleaner().clean(html);
        String cleanHtml = new SimpleHtmlSerializer(new CleanerProperties())
                .getAsString(tagNode);

        // write cleanHtml to a file
        try (BufferedWriter bf = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream("clean.html.xml"), StandardCharsets.UTF_8))) {
            bf.write(cleanHtml);
        }

        // we need a DOM document to execute xpath, HtmlCleaner helps in creating one
        Document doc = new DomSerializer(new CleanerProperties())
                .createDOM(tagNode);

        // save dom to file with a transformer (just for testing purposes)
        Transformer transformer = TransformerFactory.newInstance().newTransformer();
        transformer.transform(new DOMSource(doc), new StreamResult(
                new File("dom.html.xml")));

        // the xpath part
        XPath xpath = XPathFactory.newInstance().newXPath();
        String imgURL = (String) xpath.evaluate("//img/@src", doc,
                XPathConstants.STRING);

        if (imgURL.isEmpty()) {
            // With STRING as the return type an unmatched expression yields
            // "", which would resolve to the page address itself below.
            System.out.println("No img src found in " + address);
            return;
        }

        // using two URLs we can make sure we get the absolute URL even if relative.
        System.out.println(new URL(new URL(address), imgURL).toString());
    }
}


