Coverage Report - de.fuberlin.wiwiss.ng4j.semwebclient.DereferencerThread
 
Classes in this File Line Coverage Branch Coverage Complexity
DereferencerThread
0%
0/186
0%
0/62
4.588
 
 1  
 package de.fuberlin.wiwiss.ng4j.semwebclient;
 2  
 
 3  
 import java.io.ByteArrayInputStream;
 4  
 import java.io.InputStream;
 5  
 import java.io.IOException;
 6  
 import java.io.StringReader;
 7  
 import java.io.StringWriter;
 8  
 import java.net.HttpURLConnection;
 9  
 import java.net.MalformedURLException;
 10  
 import java.net.SocketTimeoutException;
 11  
 import java.net.URL;
 12  
 import java.net.URLConnection;
 13  
 import java.util.ArrayList;
 14  
 import java.util.Iterator;
 15  
 import java.util.List;
 16  
 import java.util.Map;
 17  
 
 18  
 import javax.xml.transform.Transformer;
 19  
 import javax.xml.transform.dom.DOMSource;
 20  
 import javax.xml.transform.stream.StreamResult;
 21  
 
 22  
 import org.cyberneko.html.parsers.DOMParser;
 23  
 import org.xml.sax.InputSource;
 24  
 
 25  
 import org.apache.commons.logging.Log;
 26  
 import org.apache.commons.logging.LogFactory;
 27  
 
 28  
 import com.hp.hpl.jena.rdf.model.Model;
 29  
 import com.hp.hpl.jena.rdf.model.ModelFactory;
 30  
 import com.hp.hpl.jena.rdf.model.impl.RDFDefaultErrorHandler;
 31  
 
 32  
 import de.fuberlin.wiwiss.ng4j.NamedGraph;
 33  
 import de.fuberlin.wiwiss.ng4j.NamedGraphSet;
 34  
 import de.fuberlin.wiwiss.ng4j.NamedGraphSetFactory;
 35  
 import de.fuberlin.wiwiss.ng4j.impl.NamedGraphImpl;
 36  
 import de.fuberlin.wiwiss.ng4j.semwebclient.threadutils.Task;
 37  
 import de.fuberlin.wiwiss.ng4j.semwebclient.threadutils.TaskExecutorBase;
 38  
 
 39  
 /**
 40  
  * The DereferencerThread executes a given DereferencingTask. It opens an
 41  
  * HttpURLConnection, reads the response and tries to parse it. When the
 42  
  * thread has finished the task, it delivers the retrieval result.
 43  
  * 
 44  
  * @author Tobias Gauß
 45  
  * @author Olaf Hartig
 46  
  * @author Hannes Mühleisen
 47  
  */
 48  
 public class DereferencerThread extends TaskExecutorBase {
 49  
         // HTTP connection of the task currently being executed; assigned in
         // executeTask and reset to null once the connection has been closed.
         private HttpURLConnection connection;
 50  
 
 51  
         // Creates the NamedGraphSet that receives the data of each task.
         final protected NamedGraphSetFactory ngsFactory;
 52  0
         // Graph set for the task in progress; replaced at the start of every task.
         private NamedGraphSet tempNgs = null;
 53  
         
 54  0
         // Limit handed to LimitedInputStream when non-HTML content is read.
         // NOTE(review): -1 presumably means "no limit" -- confirm in LimitedInputStream.
         private int maxfilesize = -1;
 55  
 
 56  0
         // When true, HTML responses are run through the GRDDL reader.
         private boolean enablegrddl = false;
 57  0
         // When true, HTML responses are transformed with transformerForRDFa.
         private boolean enableRDFa = false;
 58  0
         // Passed to URLConnection.setConnectTimeout; the JDK treats 0 as "no timeout".
         private int connectTimeout = 0;
 59  0
         // Passed to URLConnection.setReadTimeout; the JDK treats 0 as "no timeout".
         private int readTimeout = 0;
 60  
 
 61  
         // URL built from the URI of the task currently being executed.
         private URL url;
 62  
 
 63  
         // Transformer applied to the parsed HTML DOM to obtain RDF/XML from RDFa.
         private Transformer transformerForRDFa;
 64  
 
 65  0
         // Logger of this class; all messages are written at debug level.
         private Log log = LogFactory.getLog(DereferencerThread.class);
 66  
 
 67  0
         public DereferencerThread( NamedGraphSetFactory ngsFactory ) {
 68  0
                 this.ngsFactory = ngsFactory;
 69  
                 // Lower priority a little bit
 70  0
                 setPriority(getPriority() - 1);
 71  0
         }
 72  
 
 73  
 
 74  
         // implementation of the TaskExecutorBase interface
 75  
 
 76  
         public Class<?> getTaskType () {
 77  0
                 return DereferencingTask.class;
 78  
         }
 79  
 
 80  
 
 81  
         protected void executeTask ( Task task ) {
 82  0
                 DereferencingResult result = executeTask( (DereferencingTask) task );
 83  
                 // deliver the result of the task to the listeners
 84  0
                 synchronized ( this ) {
 85  0
                         if ( isStopped() )
 86  0
                                 return;
 87  
 
 88  0
                         ( (DereferencingTask) task ).notifyListeners( result );
 89  0
                 }
 90  0
         }
 91  
 
 92  
 
 93  
         // methods kept for compatibility
 94  
 
 95  
         /**
 96  
          * @return Returns true if the DereferencerThread is available for new
 97  
          *         tasks.
 98  
          */
 99  
         public synchronized boolean isAvailable() {
 100  0
                 return !hasTask() && !isStopped();
 101  
         }
 102  
 
 103  
         /**
 104  
          * Starts to execute the DereferencingTask task. Returns true if the
 105  
          * retrieval process is started false if the thread is unable to execute the
 106  
          * task.
 107  
          * @deprecated Please use {@link TaskExecutorBase#startTask} instead.
 108  
          * 
 109  
          * @param task
 110  
          *            The task to execute.
 111  
          */
 112  
         public synchronized boolean startDereferencingIfAvailable(
 113  
                         DereferencingTask task) {
 114  0
                 if (!isAvailable()) {
 115  0
                         return false;
 116  
                 }
 117  0
                 startTask( task );
 118  0
                 return true;
 119  
         }
 120  
 
 121  
 
 122  
         // helper methods
 123  
 
 124  
         /**
 125  
          * Creates a new DereferencingResult which contains information about the
 126  
          * retrieval failure.
 127  
          * 
 128  
          * @param errorCode
 129  
          *            the error code
 130  
          * @param exception
 131  
          *            the thrown exception
 132  
          * @return
 133  
          */
 134  
         private DereferencingResult createErrorResult(DereferencingTask task, int errorCode,
 135  
                         Exception exception, Map<String,List<String>> headerFields ) {
 136  0
                 return new DereferencingResult(task, errorCode, null, exception, headerFields );
 137  
         }
 138  
 
 139  
         /**
 140  
          * Creates a new DereferencingResult which contains information about the
 141  
          * retrieval failure.
 142  
          * 
 143  
          * @param errorCode
 144  
          *            the error code
 145  
          * @param exception
 146  
          *            the thrown exception
 147  
          * @return
 148  
          */
 149  
         private DereferencingResult createNewUrisResult(DereferencingTask task, int errorCode, ArrayList<String> urilist) {
 150  0
                 return new DereferencingResult(task, errorCode, urilist, connection.getHeaderFields());
 151  
         }        
 152  
 
 153  
         public DereferencingResult executeTask(DereferencingTask task) {
 154  0
                 DereferencingResult result = null;
 155  0
                 this.tempNgs = ngsFactory.create();
 156  
                 try {
 157  0
                         url = new URL(task.getURI());
 158  0
                 } catch (MalformedURLException ex) {
 159  0
                         return createErrorResult( task, DereferencingResult.STATUS_MALFORMED_URL, ex, null );
 160  0
                 }
 161  
 
 162  
                 try {
 163  0
                         URLConnection con = url.openConnection();
 164  
 
 165  
 // TODO This works only with Java 5,
 166  
 // and Tobias said he's not even sure if it has any positive effect. [RC]
 167  
 //                        con.setReadTimeout(60000);
 168  
 
 169  
 // It does (at least for me?) [Olaf]
 170  0
                         con.setConnectTimeout( connectTimeout );
 171  0
                         con.setReadTimeout( readTimeout );
 172  
 
 173  0
                         if ( task.conditional ) {
 174  0
                                 con.setIfModifiedSince( task.ifModifiedSince );
 175  
                         }
 176  
 
 177  0
                         connection = (HttpURLConnection) con;
 178  0
                 } catch ( IOException e ) {
 179  0
                         log.debug( "Creating a connection to <" + url.toString() + "> caused a " + e.getClass().getName() + ": " + e.getMessage(), e );
 180  0
                         return createErrorResult( task, DereferencingResult.STATUS_UNABLE_TO_CONNECT, e, null );
 181  0
                 }
 182  
 
 183  0
                 connection.setInstanceFollowRedirects(false);
 184  0
                 connection.addRequestProperty(
 185  
                                                         "Accept",
 186  
                                                         "application/rdf+xml;q=1,"
 187  
                                                         + "text/xml;q=0.6,text/rdf+n3;q=0.9,"
 188  
                                                         + "application/octet-stream;q=0.5,"
 189  
                                                         + "application/xml q=0.5,application/rss+xml;q=0.5,"
 190  
                                                         + "text/plain; q=0.5,application/x-turtle;q=0.5,"
 191  
                                                         + "application/x-trig;q=0.5,"
 192  
                                                         + "application/xhtml+xml;q=0.5, "
 193  
                                                         + "text/html;q=0.5"
 194  
                                                         );
 195  
 
 196  
                 try {
 197  0
                         connection.connect();
 198  0
                 } catch ( SocketTimeoutException e ) {
 199  0
                         log.debug( "Connecting to <" + url.toString() + "> caused a " + e.getClass().getName() + ": " + e.getMessage() );
 200  0
                         connection.disconnect();
 201  0
                         connection = null;
 202  0
                         return createErrorResult( task, DereferencingResult.STATUS_TIMEOUT, e, null );
 203  0
                 } catch ( IOException e ) {
 204  0
                         log.debug( "Connecting to <" + url.toString() + "> caused a " + e.getClass().getName() + ": " + e.getMessage(), e );
 205  0
                         connection.disconnect();
 206  0
                         connection = null;
 207  0
                         return createErrorResult( task, DereferencingResult.STATUS_UNABLE_TO_CONNECT, e, null );
 208  0
                 } catch ( RuntimeException e ) {
 209  0
                         log.debug( "Connecting to <" + url.toString() + "> caused a " + e.getClass().getName() + ": " + e.getMessage() );
 210  0
                         connection.disconnect();
 211  0
                         connection = null;
 212  0
                         return createErrorResult( task, DereferencingResult.STATUS_UNABLE_TO_CONNECT, e, null );
 213  0
                 }
 214  
 
 215  
                 try {
 216  0
                         this.log.debug(this.connection.getResponseCode() + " " + this.url
 217  
                                        + " (" + this.connection.getContentType() + ")");
 218  
 
 219  0
                         if (    (this.connection.getResponseCode() == 301)
 220  
                              || (this.connection.getResponseCode() == 302)
 221  
                              || (this.connection.getResponseCode() == 303) ) {
 222  0
                                 String redirectURI = this.connection.getHeaderField("Location");
 223  0
                                 DereferencingResult r = new DereferencingResult( task,
 224  
                                                                                  DereferencingResult.STATUS_REDIRECTED,
 225  
                                                                                  redirectURI,
 226  
                                                                                  connection.getHeaderFields() );
 227  0
                                 connection.disconnect();
 228  0
                                 connection = null;
 229  0
                                 return r;
 230  
                         }
 231  
 
 232  0
                         if ( this.connection.getResponseCode() == 304 ) {
 233  0
                                 DereferencingResult r = new DereferencingResult( task,
 234  
                                                                                  DereferencingResult.STATUS_UNMODIFIED,
 235  
                                                                                  null,
 236  
                                                                                  null,
 237  
                                                                                  connection.getHeaderFields() );
 238  0
                                 connection.disconnect();
 239  0
                                 connection = null;
 240  0
                                 return r;
 241  
                         }
 242  
 
 243  0
                         if ( this.connection.getResponseCode() != 200 ) {
 244  0
                                 DereferencingResult r = createErrorResult( task,
 245  
                                                                            DereferencingResult.STATUS_UNABLE_TO_CONNECT,
 246  
                                                                            new Exception("Unexpected response code ("+connection.getResponseCode()+")"),
 247  
                                                                            connection.getHeaderFields() );
 248  0
                                 connection.disconnect();
 249  0
                                 connection = null;
 250  0
                                 return r;
 251  
                         }
 252  
 
 253  0
                         if ( connection.getContentType() == null ) {
 254  0
                                 DereferencingResult r = createErrorResult( task,
 255  
                                                                            DereferencingResult.STATUS_UNABLE_TO_CONNECT,
 256  
                                                                            new Exception("Unknown content type"),
 257  
                                                                            connection.getHeaderFields() );
 258  0
                                 connection.disconnect();
 259  0
                                 connection = null;
 260  0
                                 return r;
 261  
                         }
 262  
 
 263  0
                         String lang = setLang();
 264  
                         try {
 265  0
                                 result = this.parseRdf(task, lang);
 266  0
                         } catch (Exception ex) { // parse error
 267  0
                                 this.log.debug(ex.getMessage());
 268  0
                                 DereferencingResult r = createErrorResult( task,
 269  
                                                                            DereferencingResult.STATUS_PARSING_FAILED,
 270  
                                                                            ex,
 271  
                                                                            connection.getHeaderFields() );
 272  0
                                 connection.disconnect();
 273  0
                                 connection = null;
 274  0
                                 return r;
 275  0
                         }
 276  
                         // }
 277  0
                 } catch ( SocketTimeoutException e ) {
 278  0
                         log.debug( "Accessing the connection to <" + url.toString() + "> caused a " + e.getClass().getName() + ": " + e.getMessage() );
 279  0
                         DereferencingResult r = createErrorResult( task,
 280  
                                                                    DereferencingResult.STATUS_TIMEOUT,
 281  
                                                                    e,
 282  
                                                                    null );
 283  0
                         connection.disconnect();
 284  0
                         connection = null;
 285  0
                         return r;
 286  0
                 } catch (IOException e) {
 287  0
                         log.debug( "Accessing the connection to <" + url.toString() + "> caused a " + e.getClass().getName() + ": " + e.getMessage(), e );
 288  0
                         DereferencingResult r = createErrorResult( task,
 289  
                                                                    DereferencingResult.STATUS_UNABLE_TO_CONNECT,
 290  
                                                                    e,
 291  
                                                                    null );
 292  0
                         connection.disconnect();
 293  0
                         connection = null;
 294  0
                         return r;
 295  0
                 }
 296  
                 //return new DereferencingResult(this.task,
 297  
                 //                DereferencingResult.STATUS_OK, this.tempNgs, null);
 298  0
                 connection.disconnect();
 299  0
                 connection = null;
 300  0
                 return result;
 301  
         }
 302  
 
 303  
         /**
 304  
          * Parses an RDF String.
 305  
          */
 306  
         private DereferencingResult parseRdf(DereferencingTask task, String lang) throws Exception {
 307  0
                 if (    (lang != null)
 308  
                      && (lang.toUpperCase().equals("HTML")) ) {
 309  
 
 310  
                         // read input stream into a string, so it can be reused
 311  0
                         String htmlContent = DereferencerThread.readout(this.connection.getInputStream());
 312  
 
 313  0
                         if (this.enablegrddl) {
 314  0
                             com.hp.hpl.jena.grddl.GRDDLReader r = new com.hp.hpl.jena.grddl.GRDDLReader();
 315  
                             /*
 316  
                             Gleaner g = new Gleaner(this.connection.getURL().toString(),
 317  
                                                     this.connection.getInputStream());
 318  
                             g.glean(this.tempNgs);
 319  
                             */
 320  0
                             Model m = ModelFactory.createDefaultModel();
 321  0
                             r.read(m, new ByteArrayInputStream(htmlContent.getBytes()), this.url.toString());
 322  0
                             this.tempNgs.addGraph( new NamedGraphImpl(this.url.toString(), 
 323  
                                                                       m.getGraph()) );
 324  
 
 325  0
                             if (this.tempNgs.countGraphs() > 0)
 326  0
                                 return new DereferencingResult(task,
 327  
                                                                DereferencingResult.STATUS_OK, this.tempNgs, null, connection.getHeaderFields());
 328  
                         }
 329  
 
 330  
                         // parse the HTML for references to alternative representations
 331  0
                         ArrayList<String> l = HtmlLinkFetcher.fetchLinks(htmlContent);
 332  0
                         if ( ! l.isEmpty() ) {
 333  0
                                 Iterator<String> iter = l.iterator();
 334  0
                                 ArrayList<String> urilist = new ArrayList<String>();
 335  0
                                 while (iter.hasNext()) {
 336  0
                                         String link = iter.next();
 337  0
                                         link = link.replace( "&amp;", "&" );
 338  0
                                         link = link.replace( "&gt;", ">" );
 339  0
                                         link = link.replace( "&lt;", "<" );
 340  
                                         try {
 341  0
                                                 URL newURL = new URL( url, link );
 342  0
                                                 urilist.add( newURL.toString() );
 343  0
                                         } catch ( MalformedURLException e ) {
 344  0
                                                 log.debug( "Creating a URL from the link <" + link + "> fetched for <" + url.toString() + "> caused an exception (" + e.getMessage() + ").", e );
 345  0
                                         }
 346  0
                                 }
 347  0
                                 return createNewUrisResult(task, DereferencingResult.STATUS_NEW_URIS_FOUND, urilist);
 348  
                         }
 349  
 
 350  
 
 351  0
                         if ( this.enableRDFa ) {
 352  0
                                 log.debug( "Parsing HTML from <" + url.toString() + "> for RDFa" );
 353  
 
 354  
                                 // RDF/XML output buffer
 355  0
                                 StringWriter rdfxml = new StringWriter();
 356  
 
 357  
                                 // Uses nekoHTML Parser
 358  0
                                 DOMParser parser = new DOMParser();
 359  0
                                 parser.setFeature("http://xml.org/sax/features/namespaces", false);
 360  0
                                 parser.setFeature("http://cyberneko.org/html/features/balance-tags", true);
 361  0
                                 parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content", true);
 362  
 
 363  
                                 // error-tolerant parsing with nekoHTML
 364  0
                                 parser.parse( new InputSource(new StringReader(htmlContent)) );
 365  
 
 366  
                                 // perform XSLT transformation from RDFa into RDF/XML
 367  0
                                 transformerForRDFa.transform( new DOMSource(parser.getDocument(), url.toString()),
 368  
                                                               new StreamResult(rdfxml) );
 369  
 
 370  
                                 // parse RDF/XML into triples and return
 371  0
                                 StringReader rdfParserIn = new StringReader(rdfxml.getBuffer().toString());
 372  0
                                 tempNgs.read(rdfParserIn, "RDF/XML", url.toString());
 373  
 
 374  
                                 // Count the number of extracted triples. If there are any we can
 375  
                                 // return a result. Otherwise we have to proceed below.
 376  0
                                 int triplesCount = 0;
 377  0
                                 Iterator<NamedGraph> graphIt = tempNgs.listGraphs();
 378  0
                                 while (graphIt.hasNext()) {
 379  0
                                         NamedGraph graph = graphIt.next();
 380  0
                                         triplesCount += graph.size();
 381  0
                                 }
 382  
 
 383  0
                                 if (triplesCount > 0) {
 384  0
                                         log.debug( "Found RDFa in HTML from <" + url.toString() + ">");
 385  0
                                         return new DereferencingResult( task,
 386  
                                                                         DereferencingResult.STATUS_OK,
 387  
                                                                         this.tempNgs,
 388  
                                                                         null,
 389  
                                                                         connection.getHeaderFields() );
 390  
                                 }
 391  
                                 else {
 392  0
                                         log.debug( "No RDFa in HTML from <" + url.toString() + ">");
 393  
                                 }
 394  
                         }
 395  
 
 396  0
                         return createNewUrisResult( task,
 397  
                                                     DereferencingResult.STATUS_NEW_URIS_FOUND,
 398  
                                                     new ArrayList<String>() );
 399  
                 }
 400  
                         
 401  0
                 RDFDefaultErrorHandler.silent = true;
 402  0
                 LimitedInputStream lis = new LimitedInputStream(this.connection.getInputStream(),this.maxfilesize);
 403  0
                 this.tempNgs.read(lis, lang, this.url
 404  
                                 .toString());
 405  0
                 return new DereferencingResult( task,
 406  
                                                 DereferencingResult.STATUS_OK,
 407  
                                                 this.tempNgs,
 408  
                                                 null, // no exception
 409  
                                                 connection.getHeaderFields() );
 410  
         }
 411  
 
 412  
         /**
 413  
          * Tries to guess a lang String from a connection.
 414  
          * 
 415  
          * @return
 416  
          */
 417  
         private String setLang() {
 418  0
                 String type = this.connection.getContentType();
 419  0
                 if (type == null)
 420  0
                         return null;
 421  
 
 422  0
                 if (type.startsWith("application/rdf+xml")
 423  
                                 || type.startsWith("text/xml")
 424  
                                 || type.startsWith("application/xml")
 425  
                                 || type.startsWith("application/rss+xml")
 426  
                                 || type.startsWith("text/plain"))
 427  0
                         return "RDF/XML";
 428  0
                 if (type.startsWith("application/n3")
 429  
                                 || type.startsWith("application/x-turtle")
 430  
                                 || type.startsWith("text/rdf+n3"))
 431  0
                         return "N3";
 432  0
                 if (type.contains("html"))
 433  0
                         return "HTML";
 434  
 
 435  0
                 return type;
 436  
         }
 437  
 
 438  
 
 439  
         // accessor methods
 440  
 
 441  
         public synchronized void setMaxfilesize(int size){
 442  0
                 this.maxfilesize = size;
 443  0
         }
 444  
         public synchronized void setEnableGrddl(boolean g){
 445  0
                 this.enablegrddl = g;
 446  0
         }
 447  
         public synchronized void setEnableRDFa(boolean r){
 448  0
                 enableRDFa = r;
 449  0
         }
 450  
         public synchronized void setRDFaTransformer(Transformer t){
 451  0
                 transformerForRDFa = t;
 452  0
         }
 453  
         public synchronized void setConnectTimeout(int t){
 454  0
                 connectTimeout = t;
 455  0
         }
 456  
         public synchronized void setReadTimeout(int t){
 457  0
                 readTimeout = t;
 458  0
         }
 459  
 
 460  
         /*
 461  
          * public void parseHTML(){ String site = null; try{ InputStream stream =
 462  
          * this.connection.getInputStream(); BufferedReader br = new
 463  
          * BufferedReader(new InputStreamReader(stream)); StringBuffer sb = new
 464  
          * StringBuffer(); String line = null;
 465  
          * 
 466  
          * while ((line = br.readLine()) != null) { sb.append(line + "\n"); } site =
 467  
          * sb.toString(); Pattern p = Pattern.compile("<link\\s*rel"); Matcher
 468  
          * match = p.matcher(site); int start = match.start(1); int end =
 469  
          * match.end(1);
 470  
          * 
 471  
          * }catch(Exception e){ System.out.println(e.getLocalizedMessage()); }
 472  
          *  }
 473  
          */
 474  
 
 475  
         static public String readout ( InputStream in ) throws IOException {
 476  0
                 StringBuffer out = new StringBuffer();
 477  0
                 byte[] b = new byte[4096];
 478  0
                 for (int n; (n = in.read(b)) != -1;) {
 479  0
                         out.append(new String(b, 0, n));
 480  
                 }
 481  0
                 return out.toString();
 482  
         }
 483  
 }