1 | |
package de.fuberlin.wiwiss.ng4j.semwebclient; |
2 | |
|
3 | |
import java.io.ByteArrayInputStream; |
4 | |
import java.io.InputStream; |
5 | |
import java.io.IOException; |
6 | |
import java.io.StringReader; |
7 | |
import java.io.StringWriter; |
8 | |
import java.net.HttpURLConnection; |
9 | |
import java.net.MalformedURLException; |
10 | |
import java.net.SocketTimeoutException; |
11 | |
import java.net.URL; |
12 | |
import java.net.URLConnection; |
13 | |
import java.util.ArrayList; |
14 | |
import java.util.Iterator; |
15 | |
import java.util.List; |
16 | |
import java.util.Map; |
17 | |
|
18 | |
import javax.xml.transform.Transformer; |
19 | |
import javax.xml.transform.dom.DOMSource; |
20 | |
import javax.xml.transform.stream.StreamResult; |
21 | |
|
22 | |
import org.cyberneko.html.parsers.DOMParser; |
23 | |
import org.xml.sax.InputSource; |
24 | |
|
25 | |
import org.apache.commons.logging.Log; |
26 | |
import org.apache.commons.logging.LogFactory; |
27 | |
|
28 | |
import com.hp.hpl.jena.rdf.model.Model; |
29 | |
import com.hp.hpl.jena.rdf.model.ModelFactory; |
30 | |
import com.hp.hpl.jena.rdf.model.impl.RDFDefaultErrorHandler; |
31 | |
|
32 | |
import de.fuberlin.wiwiss.ng4j.NamedGraph; |
33 | |
import de.fuberlin.wiwiss.ng4j.NamedGraphSet; |
34 | |
import de.fuberlin.wiwiss.ng4j.NamedGraphSetFactory; |
35 | |
import de.fuberlin.wiwiss.ng4j.impl.NamedGraphImpl; |
36 | |
import de.fuberlin.wiwiss.ng4j.semwebclient.threadutils.Task; |
37 | |
import de.fuberlin.wiwiss.ng4j.semwebclient.threadutils.TaskExecutorBase; |
38 | |
|
39 | |
|
40 | |
|
41 | |
|
42 | |
|
43 | |
|
44 | |
|
45 | |
|
46 | |
|
47 | |
|
48 | |
public class DereferencerThread extends TaskExecutorBase { |
49 | |
private HttpURLConnection connection; |
50 | |
|
51 | |
final protected NamedGraphSetFactory ngsFactory; |
52 | 0 | private NamedGraphSet tempNgs = null; |
53 | |
|
54 | 0 | private int maxfilesize = -1; |
55 | |
|
56 | 0 | private boolean enablegrddl = false; |
57 | 0 | private boolean enableRDFa = false; |
58 | 0 | private int connectTimeout = 0; |
59 | 0 | private int readTimeout = 0; |
60 | |
|
61 | |
private URL url; |
62 | |
|
63 | |
private Transformer transformerForRDFa; |
64 | |
|
65 | 0 | private Log log = LogFactory.getLog(DereferencerThread.class); |
66 | |
|
67 | 0 | public DereferencerThread( NamedGraphSetFactory ngsFactory ) { |
68 | 0 | this.ngsFactory = ngsFactory; |
69 | |
|
70 | 0 | setPriority(getPriority() - 1); |
71 | 0 | } |
72 | |
|
73 | |
|
74 | |
|
75 | |
|
76 | |
public Class<?> getTaskType () { |
77 | 0 | return DereferencingTask.class; |
78 | |
} |
79 | |
|
80 | |
|
81 | |
protected void executeTask ( Task task ) { |
82 | 0 | DereferencingResult result = executeTask( (DereferencingTask) task ); |
83 | |
|
84 | 0 | synchronized ( this ) { |
85 | 0 | if ( isStopped() ) |
86 | 0 | return; |
87 | |
|
88 | 0 | ( (DereferencingTask) task ).notifyListeners( result ); |
89 | 0 | } |
90 | 0 | } |
91 | |
|
92 | |
|
93 | |
|
94 | |
|
95 | |
|
96 | |
|
97 | |
|
98 | |
|
99 | |
public synchronized boolean isAvailable() { |
100 | 0 | return !hasTask() && !isStopped(); |
101 | |
} |
102 | |
|
103 | |
|
104 | |
|
105 | |
|
106 | |
|
107 | |
|
108 | |
|
109 | |
|
110 | |
|
111 | |
|
112 | |
public synchronized boolean startDereferencingIfAvailable( |
113 | |
DereferencingTask task) { |
114 | 0 | if (!isAvailable()) { |
115 | 0 | return false; |
116 | |
} |
117 | 0 | startTask( task ); |
118 | 0 | return true; |
119 | |
} |
120 | |
|
121 | |
|
122 | |
|
123 | |
|
124 | |
|
125 | |
|
126 | |
|
127 | |
|
128 | |
|
129 | |
|
130 | |
|
131 | |
|
132 | |
|
133 | |
|
134 | |
private DereferencingResult createErrorResult(DereferencingTask task, int errorCode, |
135 | |
Exception exception, Map<String,List<String>> headerFields ) { |
136 | 0 | return new DereferencingResult(task, errorCode, null, exception, headerFields ); |
137 | |
} |
138 | |
|
139 | |
|
140 | |
|
141 | |
|
142 | |
|
143 | |
|
144 | |
|
145 | |
|
146 | |
|
147 | |
|
148 | |
|
149 | |
private DereferencingResult createNewUrisResult(DereferencingTask task, int errorCode, ArrayList<String> urilist) { |
150 | 0 | return new DereferencingResult(task, errorCode, urilist, connection.getHeaderFields()); |
151 | |
} |
152 | |
|
153 | |
public DereferencingResult executeTask(DereferencingTask task) { |
154 | 0 | DereferencingResult result = null; |
155 | 0 | this.tempNgs = ngsFactory.create(); |
156 | |
try { |
157 | 0 | url = new URL(task.getURI()); |
158 | 0 | } catch (MalformedURLException ex) { |
159 | 0 | return createErrorResult( task, DereferencingResult.STATUS_MALFORMED_URL, ex, null ); |
160 | 0 | } |
161 | |
|
162 | |
try { |
163 | 0 | URLConnection con = url.openConnection(); |
164 | |
|
165 | |
|
166 | |
|
167 | |
|
168 | |
|
169 | |
|
170 | 0 | con.setConnectTimeout( connectTimeout ); |
171 | 0 | con.setReadTimeout( readTimeout ); |
172 | |
|
173 | 0 | if ( task.conditional ) { |
174 | 0 | con.setIfModifiedSince( task.ifModifiedSince ); |
175 | |
} |
176 | |
|
177 | 0 | connection = (HttpURLConnection) con; |
178 | 0 | } catch ( IOException e ) { |
179 | 0 | log.debug( "Creating a connection to <" + url.toString() + "> caused a " + e.getClass().getName() + ": " + e.getMessage(), e ); |
180 | 0 | return createErrorResult( task, DereferencingResult.STATUS_UNABLE_TO_CONNECT, e, null ); |
181 | 0 | } |
182 | |
|
183 | 0 | connection.setInstanceFollowRedirects(false); |
184 | 0 | connection.addRequestProperty( |
185 | |
"Accept", |
186 | |
"application/rdf+xml;q=1," |
187 | |
+ "text/xml;q=0.6,text/rdf+n3;q=0.9," |
188 | |
+ "application/octet-stream;q=0.5," |
189 | |
+ "application/xml q=0.5,application/rss+xml;q=0.5," |
190 | |
+ "text/plain; q=0.5,application/x-turtle;q=0.5," |
191 | |
+ "application/x-trig;q=0.5," |
192 | |
+ "application/xhtml+xml;q=0.5, " |
193 | |
+ "text/html;q=0.5" |
194 | |
); |
195 | |
|
196 | |
try { |
197 | 0 | connection.connect(); |
198 | 0 | } catch ( SocketTimeoutException e ) { |
199 | 0 | log.debug( "Connecting to <" + url.toString() + "> caused a " + e.getClass().getName() + ": " + e.getMessage() ); |
200 | 0 | connection.disconnect(); |
201 | 0 | connection = null; |
202 | 0 | return createErrorResult( task, DereferencingResult.STATUS_TIMEOUT, e, null ); |
203 | 0 | } catch ( IOException e ) { |
204 | 0 | log.debug( "Connecting to <" + url.toString() + "> caused a " + e.getClass().getName() + ": " + e.getMessage(), e ); |
205 | 0 | connection.disconnect(); |
206 | 0 | connection = null; |
207 | 0 | return createErrorResult( task, DereferencingResult.STATUS_UNABLE_TO_CONNECT, e, null ); |
208 | 0 | } catch ( RuntimeException e ) { |
209 | 0 | log.debug( "Connecting to <" + url.toString() + "> caused a " + e.getClass().getName() + ": " + e.getMessage() ); |
210 | 0 | connection.disconnect(); |
211 | 0 | connection = null; |
212 | 0 | return createErrorResult( task, DereferencingResult.STATUS_UNABLE_TO_CONNECT, e, null ); |
213 | 0 | } |
214 | |
|
215 | |
try { |
216 | 0 | this.log.debug(this.connection.getResponseCode() + " " + this.url |
217 | |
+ " (" + this.connection.getContentType() + ")"); |
218 | |
|
219 | 0 | if ( (this.connection.getResponseCode() == 301) |
220 | |
|| (this.connection.getResponseCode() == 302) |
221 | |
|| (this.connection.getResponseCode() == 303) ) { |
222 | 0 | String redirectURI = this.connection.getHeaderField("Location"); |
223 | 0 | DereferencingResult r = new DereferencingResult( task, |
224 | |
DereferencingResult.STATUS_REDIRECTED, |
225 | |
redirectURI, |
226 | |
connection.getHeaderFields() ); |
227 | 0 | connection.disconnect(); |
228 | 0 | connection = null; |
229 | 0 | return r; |
230 | |
} |
231 | |
|
232 | 0 | if ( this.connection.getResponseCode() == 304 ) { |
233 | 0 | DereferencingResult r = new DereferencingResult( task, |
234 | |
DereferencingResult.STATUS_UNMODIFIED, |
235 | |
null, |
236 | |
null, |
237 | |
connection.getHeaderFields() ); |
238 | 0 | connection.disconnect(); |
239 | 0 | connection = null; |
240 | 0 | return r; |
241 | |
} |
242 | |
|
243 | 0 | if ( this.connection.getResponseCode() != 200 ) { |
244 | 0 | DereferencingResult r = createErrorResult( task, |
245 | |
DereferencingResult.STATUS_UNABLE_TO_CONNECT, |
246 | |
new Exception("Unexpected response code ("+connection.getResponseCode()+")"), |
247 | |
connection.getHeaderFields() ); |
248 | 0 | connection.disconnect(); |
249 | 0 | connection = null; |
250 | 0 | return r; |
251 | |
} |
252 | |
|
253 | 0 | if ( connection.getContentType() == null ) { |
254 | 0 | DereferencingResult r = createErrorResult( task, |
255 | |
DereferencingResult.STATUS_UNABLE_TO_CONNECT, |
256 | |
new Exception("Unknown content type"), |
257 | |
connection.getHeaderFields() ); |
258 | 0 | connection.disconnect(); |
259 | 0 | connection = null; |
260 | 0 | return r; |
261 | |
} |
262 | |
|
263 | 0 | String lang = setLang(); |
264 | |
try { |
265 | 0 | result = this.parseRdf(task, lang); |
266 | 0 | } catch (Exception ex) { |
267 | 0 | this.log.debug(ex.getMessage()); |
268 | 0 | DereferencingResult r = createErrorResult( task, |
269 | |
DereferencingResult.STATUS_PARSING_FAILED, |
270 | |
ex, |
271 | |
connection.getHeaderFields() ); |
272 | 0 | connection.disconnect(); |
273 | 0 | connection = null; |
274 | 0 | return r; |
275 | 0 | } |
276 | |
|
277 | 0 | } catch ( SocketTimeoutException e ) { |
278 | 0 | log.debug( "Accessing the connection to <" + url.toString() + "> caused a " + e.getClass().getName() + ": " + e.getMessage() ); |
279 | 0 | DereferencingResult r = createErrorResult( task, |
280 | |
DereferencingResult.STATUS_TIMEOUT, |
281 | |
e, |
282 | |
null ); |
283 | 0 | connection.disconnect(); |
284 | 0 | connection = null; |
285 | 0 | return r; |
286 | 0 | } catch (IOException e) { |
287 | 0 | log.debug( "Accessing the connection to <" + url.toString() + "> caused a " + e.getClass().getName() + ": " + e.getMessage(), e ); |
288 | 0 | DereferencingResult r = createErrorResult( task, |
289 | |
DereferencingResult.STATUS_UNABLE_TO_CONNECT, |
290 | |
e, |
291 | |
null ); |
292 | 0 | connection.disconnect(); |
293 | 0 | connection = null; |
294 | 0 | return r; |
295 | 0 | } |
296 | |
|
297 | |
|
298 | 0 | connection.disconnect(); |
299 | 0 | connection = null; |
300 | 0 | return result; |
301 | |
} |
302 | |
|
303 | |
|
304 | |
|
305 | |
|
306 | |
private DereferencingResult parseRdf(DereferencingTask task, String lang) throws Exception { |
307 | 0 | if ( (lang != null) |
308 | |
&& (lang.toUpperCase().equals("HTML")) ) { |
309 | |
|
310 | |
|
311 | 0 | String htmlContent = DereferencerThread.readout(this.connection.getInputStream()); |
312 | |
|
313 | 0 | if (this.enablegrddl) { |
314 | 0 | com.hp.hpl.jena.grddl.GRDDLReader r = new com.hp.hpl.jena.grddl.GRDDLReader(); |
315 | |
|
316 | |
|
317 | |
|
318 | |
|
319 | |
|
320 | 0 | Model m = ModelFactory.createDefaultModel(); |
321 | 0 | r.read(m, new ByteArrayInputStream(htmlContent.getBytes()), this.url.toString()); |
322 | 0 | this.tempNgs.addGraph( new NamedGraphImpl(this.url.toString(), |
323 | |
m.getGraph()) ); |
324 | |
|
325 | 0 | if (this.tempNgs.countGraphs() > 0) |
326 | 0 | return new DereferencingResult(task, |
327 | |
DereferencingResult.STATUS_OK, this.tempNgs, null, connection.getHeaderFields()); |
328 | |
} |
329 | |
|
330 | |
|
331 | 0 | ArrayList<String> l = HtmlLinkFetcher.fetchLinks(htmlContent); |
332 | 0 | if ( ! l.isEmpty() ) { |
333 | 0 | Iterator<String> iter = l.iterator(); |
334 | 0 | ArrayList<String> urilist = new ArrayList<String>(); |
335 | 0 | while (iter.hasNext()) { |
336 | 0 | String link = iter.next(); |
337 | 0 | link = link.replace( "&", "&" ); |
338 | 0 | link = link.replace( ">", ">" ); |
339 | 0 | link = link.replace( "<", "<" ); |
340 | |
try { |
341 | 0 | URL newURL = new URL( url, link ); |
342 | 0 | urilist.add( newURL.toString() ); |
343 | 0 | } catch ( MalformedURLException e ) { |
344 | 0 | log.debug( "Creating a URL from the link <" + link + "> fetched for <" + url.toString() + "> caused an exception (" + e.getMessage() + ").", e ); |
345 | 0 | } |
346 | 0 | } |
347 | 0 | return createNewUrisResult(task, DereferencingResult.STATUS_NEW_URIS_FOUND, urilist); |
348 | |
} |
349 | |
|
350 | |
|
351 | 0 | if ( this.enableRDFa ) { |
352 | 0 | log.debug( "Parsing HTML from <" + url.toString() + "> for RDFa" ); |
353 | |
|
354 | |
|
355 | 0 | StringWriter rdfxml = new StringWriter(); |
356 | |
|
357 | |
|
358 | 0 | DOMParser parser = new DOMParser(); |
359 | 0 | parser.setFeature("http://xml.org/sax/features/namespaces", false); |
360 | 0 | parser.setFeature("http://cyberneko.org/html/features/balance-tags", true); |
361 | 0 | parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content", true); |
362 | |
|
363 | |
|
364 | 0 | parser.parse( new InputSource(new StringReader(htmlContent)) ); |
365 | |
|
366 | |
|
367 | 0 | transformerForRDFa.transform( new DOMSource(parser.getDocument(), url.toString()), |
368 | |
new StreamResult(rdfxml) ); |
369 | |
|
370 | |
|
371 | 0 | StringReader rdfParserIn = new StringReader(rdfxml.getBuffer().toString()); |
372 | 0 | tempNgs.read(rdfParserIn, "RDF/XML", url.toString()); |
373 | |
|
374 | |
|
375 | |
|
376 | 0 | int triplesCount = 0; |
377 | 0 | Iterator<NamedGraph> graphIt = tempNgs.listGraphs(); |
378 | 0 | while (graphIt.hasNext()) { |
379 | 0 | NamedGraph graph = graphIt.next(); |
380 | 0 | triplesCount += graph.size(); |
381 | 0 | } |
382 | |
|
383 | 0 | if (triplesCount > 0) { |
384 | 0 | log.debug( "Found RDFa in HTML from <" + url.toString() + ">"); |
385 | 0 | return new DereferencingResult( task, |
386 | |
DereferencingResult.STATUS_OK, |
387 | |
this.tempNgs, |
388 | |
null, |
389 | |
connection.getHeaderFields() ); |
390 | |
} |
391 | |
else { |
392 | 0 | log.debug( "No RDFa in HTML from <" + url.toString() + ">"); |
393 | |
} |
394 | |
} |
395 | |
|
396 | 0 | return createNewUrisResult( task, |
397 | |
DereferencingResult.STATUS_NEW_URIS_FOUND, |
398 | |
new ArrayList<String>() ); |
399 | |
} |
400 | |
|
401 | 0 | RDFDefaultErrorHandler.silent = true; |
402 | 0 | LimitedInputStream lis = new LimitedInputStream(this.connection.getInputStream(),this.maxfilesize); |
403 | 0 | this.tempNgs.read(lis, lang, this.url |
404 | |
.toString()); |
405 | 0 | return new DereferencingResult( task, |
406 | |
DereferencingResult.STATUS_OK, |
407 | |
this.tempNgs, |
408 | |
null, |
409 | |
connection.getHeaderFields() ); |
410 | |
} |
411 | |
|
412 | |
|
413 | |
|
414 | |
|
415 | |
|
416 | |
|
417 | |
private String setLang() { |
418 | 0 | String type = this.connection.getContentType(); |
419 | 0 | if (type == null) |
420 | 0 | return null; |
421 | |
|
422 | 0 | if (type.startsWith("application/rdf+xml") |
423 | |
|| type.startsWith("text/xml") |
424 | |
|| type.startsWith("application/xml") |
425 | |
|| type.startsWith("application/rss+xml") |
426 | |
|| type.startsWith("text/plain")) |
427 | 0 | return "RDF/XML"; |
428 | 0 | if (type.startsWith("application/n3") |
429 | |
|| type.startsWith("application/x-turtle") |
430 | |
|| type.startsWith("text/rdf+n3")) |
431 | 0 | return "N3"; |
432 | 0 | if (type.contains("html")) |
433 | 0 | return "HTML"; |
434 | |
|
435 | 0 | return type; |
436 | |
} |
437 | |
|
438 | |
|
439 | |
|
440 | |
|
441 | |
public synchronized void setMaxfilesize(int size){ |
442 | 0 | this.maxfilesize = size; |
443 | 0 | } |
444 | |
public synchronized void setEnableGrddl(boolean g){ |
445 | 0 | this.enablegrddl = g; |
446 | 0 | } |
447 | |
public synchronized void setEnableRDFa(boolean r){ |
448 | 0 | enableRDFa = r; |
449 | 0 | } |
450 | |
public synchronized void setRDFaTransformer(Transformer t){ |
451 | 0 | transformerForRDFa = t; |
452 | 0 | } |
453 | |
public synchronized void setConnectTimeout(int t){ |
454 | 0 | connectTimeout = t; |
455 | 0 | } |
456 | |
public synchronized void setReadTimeout(int t){ |
457 | 0 | readTimeout = t; |
458 | 0 | } |
459 | |
|
460 | |
|
461 | |
|
462 | |
|
463 | |
|
464 | |
|
465 | |
|
466 | |
|
467 | |
|
468 | |
|
469 | |
|
470 | |
|
471 | |
|
472 | |
|
473 | |
|
474 | |
|
475 | |
static public String readout ( InputStream in ) throws IOException { |
476 | 0 | StringBuffer out = new StringBuffer(); |
477 | 0 | byte[] b = new byte[4096]; |
478 | 0 | for (int n; (n = in.read(b)) != -1;) { |
479 | 0 | out.append(new String(b, 0, n)); |
480 | |
} |
481 | 0 | return out.toString(); |
482 | |
} |
483 | |
} |