1 package org.paneris.bibliomania.fti;
2
3 import java.io.BufferedInputStream;
4 import java.io.ByteArrayOutputStream;
5 import java.io.File;
6 import java.io.FileInputStream;
7 import java.io.IOException;
8 import java.io.InputStream;
9 import java.util.Enumeration;
10 import java.util.Hashtable;
11 import java.util.Properties;
12 import java.util.StringTokenizer;
13 import java.util.Vector;
14
15 import org.melati.util.MelatiRuntimeException;
16 import org.melati.util.PropertiesUtils;
17 import org.melati.util.PropertyException;
18 import org.melati.util.UnexpectedExceptionException;
19
20 import com.sleepycat.db.Database;
21 import com.sleepycat.db.BtreeStats;
22 import com.sleepycat.db.DatabaseConfig;
23 import com.sleepycat.db.DatabaseException;
24 import com.sleepycat.db.Cursor;
25 import com.sleepycat.db.DatabaseEntry;
26 import com.sleepycat.db.LockMode;
27 import com.sleepycat.db.OperationStatus;
28
29 public class IndexOther {
30
  /** Default index location used by {@link #main}. */
  private static final String USR_LOCAL_SHARE_BIBLIOMANIA_WORKSPACE_INFO_FTI = "/usr/local/share/bibliomania/workspace/infoFTI";
  /** Words are truncated to this many characters before being stored as keys. */
  public static final int wordTruncationLength = 100;
  /** How many words of leading context are kept for each hit (see index()). */
  public static final int contextWordsBeforeHit = 5;


  /** Directory holding the Berkeley DB files. */
  File dbHome;
  /** Dictionary: word bytes -> packed (wordID, count) record (see setWC). */
  Database idOfWord;
  DatabaseConfig idOfWordConfig;
  // Shared reusable entry buffers for idOfWord lookups; all access to them
  // is synchronized on idOfWord (see idOfWord(String) / addWordCount).
  DatabaseEntry idOfWordKey = DbUtils.userMemDatabaseEntry(wordTruncationLength);
  DatabaseEntry idOfWordData = DbUtils.userMemDatabaseEntry(wcBytesLength);
  /** Next word ID to allocate; seeded from idOfWord's key count in the constructor. */
  int nextWordID = 0;
  /** Packed (wordID, textID) key -> packed occurrence data for that word in that text. */
  Database occurrencesOfWordInText;
  /** Packed (textID, wordIndex) key -> anchor name (without the leading '#'). */
  Database anchorOfIndex;
  /** Packed (textID, wordIndex) key -> blockmark name (anchors starting "__"). */
  Database blockmarkOfIndex;
  /** Packed textID -> concatenated 3-byte wordIDs occurring in that text. */
  Database wordsInText;

  /** Finder over blockmarkOfIndex, used to locate page/block markers. */
  private AnchorFinder blockmarkFinder;
48
  /**
   * Thrown when stored index records do not have the expected layout,
   * indicating the on-disk index must be rebuilt.
   */
  public static class CorruptionException extends FTIException {

    private static final long serialVersionUID = 6251343503765495522L;
    /** Short description of the inconsistency that was detected. */
    public String corruptionProblem;

    public CorruptionException(String corruptionProblem) {
      this.corruptionProblem = corruptionProblem;
    }

    public String getMessage() {
      return "The text index is corrupt, and the " +
          "index must be rebuilt (problem: " + corruptionProblem + ")";
    }
  }
63
  /**
   * Opens the index in {@code dbHome}, loading cache-size configuration
   * from the properties resource associated with this class.
   */
  public IndexOther(File dbHome)
      throws IOException, DatabaseException, PropertyException {
    this(dbHome, null);
  }
68
69 public IndexOther(File dbHome, Properties config)
70 throws IOException, DatabaseException, PropertyException {
71 this.dbHome = dbHome;
72 if (!dbHome.exists())
73 throw new IllegalArgumentException("dbHome `" + dbHome +
74 "' does nto exist");
75 if (!dbHome.isDirectory())
76 throw new IllegalArgumentException("dbHome `" + dbHome +
77 "' is not a directory");
78
79 if (config == null)
80
81 config = PropertiesUtils.fromResource(this.getClass());
82
83 idOfWord = DbUtils.openOrCreateBTreeDb(dbHome, "idOfWord",
84 PropertiesUtils.getOrDie_int(config, "idOfWord.cacheSize"));
85
86 occurrencesOfWordInText = DbUtils.openOrCreateBTreeDb(dbHome, "occurrencesOfWordInText",
87 PropertiesUtils.getOrDie_int(config, "occurrencesOfWordInText.cacheSize"));
88
89 nextWordID = ((BtreeStats)idOfWord.getStats(null,null)).getNumKeys();
90
91
92
93 anchorOfIndex = DbUtils.openOrCreateBTreeDb(dbHome, "anchorOfIndex",
94 PropertiesUtils.getOrDie_int(config, "anchorOfIndex.cacheSize"));
95
96 blockmarkOfIndex = DbUtils.openOrCreateBTreeDb(dbHome, "blockmarkOfIndex",
97 PropertiesUtils.getOrDie_int(config, "blockmarkOfIndex.cacheSize"));
98
99
100
101 wordsInText = DbUtils.openOrCreateBTreeDb(dbHome, "wordsInText", 0);
102
103 blockmarkFinder = new AnchorFinder(this, true);
104 }
105
  /**
   * A ByteArrayOutputStream that exposes its internal buffer and count
   * directly, avoiding the defensive copy toByteArray() would make.
   */
  private static final class Buffer extends ByteArrayOutputStream {
    /** The live internal buffer; only the first {@link #count()} bytes are valid. */
    public byte[] buffer() {
      return buf;
    }

    /** Number of valid bytes at the start of {@link #buffer()}. */
    public int count() {
      return count;
    }
  }
115
  /**
   * Accumulates the occurrences of one word within one text while the
   * text is being indexed; the packed bytes are later written to
   * occurrencesOfWordInText (see index()).
   */
  public class WordRecord {
    public String word;
    /** Dictionary ID of the word, allocated via idOfWord(String). */
    public int wordID;
    /** Packed (wordIndex, offset) pairs, variable-width encoded. */
    public Buffer occurrenceData = new Buffer();
    /** Number of occurrences noted so far. */
    public int count;

    public WordRecord(String word) throws FTIException, DatabaseException {
      this.word = word;
      wordID = idOfWord(word);
    }

    // Current byte widths used to encode indices and offsets; each starts
    // at one byte and is widened as soon as a value no longer fits.
    private Packer wordIndexPacker = OnePacker.it;
    private Packer offsetPacker = OnePacker.it;

    /**
     * Records one occurrence.  Writing the packer's maximum value acts as
     * an escape marker telling readers to switch to the next wider
     * encoding, keeping writer and reader in step.
     *
     * @param wordIndex position of the word within the text, in words
     * @param offset    offset recorded for the hit (the offset of a word a
     *                  few words earlier -- see the context-window logic
     *                  in index())
     */
    public void noteOccurrence(int wordIndex, int offset) {
      try {
        while (wordIndex >= wordIndexPacker.numberMax()) {
          wordIndexPacker.write(occurrenceData, wordIndexPacker.numberMax());
          wordIndexPacker = wordIndexPacker.bigger();
        }

        wordIndexPacker.write(occurrenceData, wordIndex);

        while (offset >= offsetPacker.numberMax()) {
          offsetPacker.write(occurrenceData, offsetPacker.numberMax());
          offsetPacker = offsetPacker.bigger();
        }

        offsetPacker.write(occurrenceData, offset);
      }
      catch (IOException e) {
        // ByteArrayOutputStream never actually throws IOException
        throw new UnexpectedExceptionException(e);
      }

      ++count;
    }
  }
155
  /** Length of a WT key: 3-byte wordID followed by 5-byte textID. */
  static final int wtBytesLength = 8;

  /** Reads the wordID from the first 3 bytes of a WT key. */
  static int getWT_wordID(byte[] bytes) {
    return ThreePacker.number_(bytes, 0);
  }

  /**
   * Reads the textID from bytes 3..7 of a WT key, diagnosing keys
   * written by the older format that used 4-byte textIDs.
   */
  static long getWT_textID(byte[] bytes) {
    if (bytes.length == 3 + 5)
      return FivePacker.number_(bytes, 3);
    else if (bytes.length == 3 + 4)
      throw new CorruptionException(
          "unexpected occurrence key length 7; are you running against an " +
          "index built with an old version of the bibliomania software, from " +
          "before the number of allowed `sections' was increased?");
    else
      throw new CorruptionException(
          "unexpected occurrence key length " + bytes.length);
  }

  /** Writes a (wordID, textID) WT key into {@code bytes}. */
  static void setWT(byte[] bytes, int wordID, long textID) {
    ThreePacker.set_(bytes, 0, wordID);
    FivePacker.set_(bytes, 3, textID);
  }

  /** Length of a WC record: 3-byte wordID followed by 3-byte count. */
  static final int wcBytesLength = 6;

  /** Reads the wordID from the first 3 bytes of a WC record. */
  static int getWC_wordID(byte[] bytes) {
    return ThreePacker.number_(bytes, 0);
  }

  /** Writes the occurrence count into bytes 3..5 of a WC record. */
  static void setWC_count(byte[] bytes, int count) {
    ThreePacker.set_(bytes, 3, count);
  }

  /** Writes a complete (wordID, count) WC record into {@code bytes}. */
  static void setWC(byte[] bytes, int wordID, int count) {
    ThreePacker.set_(bytes, 0, wordID);
    setWC_count(bytes, count);
  }

  /** Reads the occurrence count from bytes 3..5 of a WC record. */
  static int getWC_count(byte[] bytes) {
    return ThreePacker.number_(bytes, 3);
  }
202
  /**
   * Stores {@code word} into {@code key} one byte per character,
   * truncated to {@link #wordTruncationLength} characters.  Each char is
   * narrowed with a byte cast, so only the low 8 bits survive --
   * NOTE(review): assumes effectively Latin-1 input; confirm the
   * tokenizer guarantees this.
   */
  static void setWord(DatabaseEntry key, String word) {
    byte[] data = key.getData();
    int length = Math.min(word.length(), wordTruncationLength);
    if (data.length < length) {
      // grow the user-supplied buffer in place
      key.setData(data = new byte[length]);
      key.setUserBuffer(length, true);
    }
    key.setSize(length);
    for (int i = 0; i < length; ++i)
      data[i] = (byte)word.charAt(i);
  }
214
  /**
   * Thrown when the dictionary runs out of word IDs (IDs are packed into
   * 3 bytes, so the maximum is 0xFFFFFF -- see idOfWord(String)).
   */
  public class WordIDExceededMaxException extends MelatiRuntimeException {

    private static final long serialVersionUID = 1L;

    public String getMessage() {
      return "Word dictionary in FTI subsystem " + dbHome +
          " exceeded maximum size";
    }
  }
226
  /**
   * Returns the dictionary ID for {@code word} (case-insensitively),
   * allocating and persisting a fresh ID with a zero count when the word
   * has not been seen before.
   *
   * @throws WordIDExceededMaxException when the 3-byte ID space is exhausted
   */
  public int idOfWord(String word) throws FTIException, DatabaseException {
    word = word.toLowerCase();

    // idOfWord also guards the shared idOfWordKey/idOfWordData buffers
    synchronized (idOfWord) {
      setWord(idOfWordKey, word);
      if (idOfWord.get(null, idOfWordKey, idOfWordData, LockMode.DEFAULT) == OperationStatus.SUCCESS)
        return getWC_wordID(idOfWordData.getData());
      else {
        if (nextWordID > 0xFFFFFF)  // IDs must fit in 3 bytes (ThreePacker)
          throw new WordIDExceededMaxException();
        setWC(idOfWordData.getData(), nextWordID, 0);
        idOfWord.put(null, idOfWordKey, idOfWordData);
        return nextWordID++;
      }
    }
  }
243
  /**
   * Adds {@code count} to the stored occurrence count of {@code word};
   * silently does nothing when the word is not in the dictionary.
   */
  private void addWordCount(String word, int count) throws DatabaseException {
    synchronized (idOfWord) {
      setWord(idOfWordKey, word.toLowerCase());
      if (idOfWord.get(null, idOfWordKey, idOfWordData, LockMode.DEFAULT) == OperationStatus.SUCCESS) {
        byte[] data = idOfWordData.getData();
        setWC_count(data, getWC_count(data) + count);
        idOfWord.put(null, idOfWordKey, idOfWordData);
      }
    }
  }
254
  /** Length of a TI key: 5-byte textID followed by 3-byte word index. */
  public static final int textIdBytesLength = 8;

  /** Writes a (textID, wordIndex) TI key into {@code bytes}. */
  public static void setTI(byte[] bytes, long textID, int index) {
    FivePacker.set_(bytes, 0, textID);
    ThreePacker.set_(bytes, 5, index);
  }

  /** Reads the textID from the first 5 bytes of a TI key. */
  public static long getTI_textID(byte[] bytes) {
    return FivePacker.number_(bytes, 0);
  }
265
  /**
   * Removes the index entries for {@code textID}: every per-word
   * occurrence record listed in wordsInText, then the wordsInText record
   * itself.  Anchor/blockmark entries are not removed here.
   */
  public void unIndex(long textID)
      throws FTIException, DatabaseException {
    DatabaseEntry text = DbUtils.userMemDatabaseEntry(5);
    FivePacker.set_(text.getData(), 0, textID);
    DatabaseEntry words = DbUtils.userMemDatabaseEntry(256);

    // NOTE(review): fetched with a 256-byte buffer; presumably DbUtils.get
    // grows or refetches for longer word lists -- confirm.
    if (DbUtils.get(wordsInText, text, words, 256) == OperationStatus.SUCCESS) {
      DatabaseEntry wt = DbUtils.userMemDatabaseEntry(wtBytesLength);
      for (int i = 0; i < words.getSize(); i += 3) {  // 3 bytes per wordID
        setWT(wt.getData(),
              ThreePacker.number_(words.getData(), i),
              textID);
        occurrencesOfWordInText.delete(null, wt);
      }
    }

    wordsInText.delete(null, text);
  }
284
285 private void noteOccurrence(Hashtable<String, WordRecord> wordRecords, String word,
286 int index, int offset) throws DatabaseException {
287
288 WordRecord wordRecord = (WordRecord)wordRecords.get(word);
289 if (wordRecord == null)
290 wordRecords.put(word, wordRecord = new WordRecord(word));
291
292 wordRecord.noteOccurrence(index, offset);
293 }
294
  /**
   * Indexes {@code text}: tokenizes its body, records every word
   * occurrence (plus anchors and blockmarks), then writes the per-word
   * occurrence data, word counts and word list to the databases.  Any
   * previous entries for the same textID are removed first.
   */
  public void index(Text text)
      throws FTIException, IOException, DatabaseException {

    long textID = text.ftiTextID();

    unIndex(textID);

    DatabaseEntry anchorOfIndexKey = DbUtils.userMemDatabaseEntry(textIdBytesLength);
    DatabaseEntry anchorOfIndexData = DbUtils.userMemDatabaseEntry(256);

    Hashtable<String, WordRecord> wordRecords = new Hashtable<String, WordRecord>();
    // Circular buffer of recent word offsets, so each hit can point back
    // contextWordsBeforeHit words for display context.
    int[] offsetHistory = new int[contextWordsBeforeHit];
    int wordCount = 0;

    InputStream body = new BufferedInputStream(text.body());
    try {
      for (IndexTokenizer words = new IndexTokenizer(body);
           words.hasMoreWords();) {
        String word = words.nextWord();
        if (word.startsWith("#")) {
          // "#name" marks an anchor, "#__name" a blockmark: store the name
          // (without the '#') keyed by (textID, wordIndex).
          boolean blockmark = word.startsWith("#__");
          setWord(anchorOfIndexData, word.substring(1));
          setTI(anchorOfIndexKey.getData(), textID, words.wordIndex());
          (blockmark ? blockmarkOfIndex : anchorOfIndex).put(
              null, anchorOfIndexKey, anchorOfIndexData);
        }
        else
          word = word.toLowerCase();

        offsetHistory[wordCount % contextWordsBeforeHit] = words.wordOffset();

        int index = words.wordIndex();
        // Offset of the word contextWordsBeforeHit back, or of the very
        // first word while still inside the initial window.
        int offset = wordCount < contextWordsBeforeHit ?
            offsetHistory[0] :
            offsetHistory[(wordCount + 1) % contextWordsBeforeHit];

        noteOccurrence(wordRecords, word, index, offset);
        // "$word" keyword markers are additionally indexed as the bare word
        if (word.startsWith("$"))
          noteOccurrence(wordRecords, word.substring(1), index, offset);

        ++wordCount;
      }
    }
    finally {
      try { body.close(); } catch (IOException e) {}
    }

    // 3 bytes per distinct word, for the wordsInText record
    byte[] allWordIDs = new byte[wordRecords.size() * 3];
    int allWordIDs_p = 0;

    DatabaseEntry occurrences = DbUtils.userMemDatabaseEntry(1);
    DatabaseEntry wt = DbUtils.userMemDatabaseEntry(wtBytesLength);

    for (Enumeration<WordRecord> w = wordRecords.elements(); w.hasMoreElements();) {
      WordRecord wordRecord = (WordRecord)w.nextElement();
      setWT(wt.getData(), wordRecord.wordID, textID);
      // Point the entry straight at the accumulator's internal buffer to
      // avoid copying the packed occurrence bytes.
      occurrences.setData(wordRecord.occurrenceData.buffer());
      occurrences.setSize(wordRecord.occurrenceData.count());
      occurrences.setUserBuffer(occurrences.getData().length,true);
      occurrencesOfWordInText.put(null, wt, occurrences);
      addWordCount(wordRecord.word, wordRecord.count);

      ThreePacker.set_(allWordIDs, allWordIDs_p, wordRecord.wordID);
      allWordIDs_p += 3;
    }

    DatabaseEntry textIDDatabaseEntry = DbUtils.userMemDatabaseEntry(5);
    FivePacker.set_(textIDDatabaseEntry.getData(), 0, textID);
    wordsInText.put(null, textIDDatabaseEntry, DbUtils.userMemDatabaseEntry(allWordIDs));
  }
365
  /** Returns results for texts containing all of {@code args} (ungrouped). */
  public SearchResults andSearchResults(String[] args)
      throws FTIException, DatabaseException {
    return new AndSearchResults(this, args, false);
  }

  /** Returns results for texts containing {@code args} as a group/phrase. */
  public SearchResults groupSearchResults(String[] args)
      throws FTIException, DatabaseException {
    return new AndSearchResults(this, args, true);
  }
375
376
377
378 public String blockmarkerBeforeFirstOccurrence(long textID, String word)
379 throws DatabaseException {
380 TextStream stream = new TextStream(this, word);
381 stream.gotoText(textID);
382 stream.init();
383 int index = stream.currentWordIndex();
384 return index == -1 ? null : blockmarkFinder.anchorOfIndex(textID, index);
385 }
386
  /**
   * Dumps every dictionary word and its occurrence count to stdout, one
   * per line as "word: count", followed by the grand total.
   */
  public void stat() throws DatabaseException {
    Cursor words = idOfWord.openCursor(null, null);
    try {
      int total = 0;
      while (words.getNext(idOfWordKey, idOfWordData, LockMode.DEFAULT) == OperationStatus.SUCCESS) {
        // the key bytes are the (truncated) word itself
        System.out.write(idOfWordKey.getData(), 0, idOfWordKey.getSize());
        System.out.print(": ");
        int count = getWC_count(idOfWordData.getData());
        System.out.println(count);
        total += count;
      }
      System.out.println("--- total " + total);
    }
    finally {
      try { words.close(); } catch (DatabaseException e) {}
    }
  }
404
405 public void flush() throws DatabaseException {
406 idOfWord.sync();
407 occurrencesOfWordInText.sync();
408 anchorOfIndex.sync();
409 blockmarkOfIndex.sync();
410 wordsInText.sync();
411 }
412
413 public void close() throws DatabaseException {
414 idOfWord.close(false);
415 occurrencesOfWordInText.close(false);
416 anchorOfIndex.close(false);
417 blockmarkOfIndex.close(false);
418 wordsInText.close(false);
419 }
420
421 protected void finalize() throws Throwable {
422 close();
423 }
424
425 public void appendTerms(Vector<SearchResults> terms, String query, boolean keywords)
426 throws DatabaseException {
427 Vector<String> group = null;
428
429 StringTokenizer tokens = new StringTokenizer(query, " \t\n\r\f\"", true);
430
431 while (tokens.hasMoreTokens()) {
432 String token = tokens.nextToken();
433 switch (token.charAt(0)) {
434 case ' ': case '\t' : case '\n': case '\r': case '\f':
435 break;
436 case '"':
437 if (group == null)
438 group = new Vector<String>();
439 else {
440 String[] g = new String[group.size()];
441 group.copyInto(g);
442 terms.addElement(new AndSearchResults(this, g, true));
443 group = null;
444 }
445 break;
446 default:
447 if (keywords)
448 token = '$' + token;
449
450 if (group != null)
451 group.addElement(token);
452 else
453 terms.addElement(new TextStream(this, token));
454 }
455 }
456
457 if (group != null) {
458 String[] g = new String[group.size()];
459 group.copyInto(g);
460 terms.addElement(new AndSearchResults(this, g, true));
461 }
462 }
463
464 public SearchResults querySearchResults(
465 String textQuery, String keywordsQuery) throws DatabaseException, FTIException {
466 Vector<SearchResults> terms = new Vector<SearchResults>();
467 if (textQuery != null) appendTerms(terms, textQuery, false);
468 if (keywordsQuery != null) appendTerms(terms, keywordsQuery, true);
469 SearchResults[] t = new SearchResults[terms.size()];
470 terms.copyInto(t);
471 return new AndSearchResults(t);
472 }
473
  /** Convenience overload: free-text query only, no keyword query. */
  public SearchResults querySearchResults(String query)
      throws DatabaseException, FTIException {
    return querySearchResults(query, null);
  }
478
  /** Returns a cursor over all entries of this index. */
  public IndexCursor allEntries() throws DatabaseException {
    return new IndexCursor(this);
  }
482
483 public static boolean debug = false;
484
485 public static void main(final String[] args) throws Exception {
486 IndexOther index = new IndexOther(
487 new File(USR_LOCAL_SHARE_BIBLIOMANIA_WORKSPACE_INFO_FTI));
488
489 if (args[0].equals("-index"))
490 for (int i = 1; i < args.length; i += 2) {
491 System.err.println("indexing " + args[i + 1]);
492 final String filename = args[i + 1];
493 final long textID = Long.parseLong(args[i]);
494 final InputStream body = new BufferedInputStream(new FileInputStream(
495 filename));
496 index.index(new Text() {
497 public InputStream body() {
498 return body;
499 }
500
501 public InputStream bodyForFragment() {
502 return body;
503 }
504
505 public long ftiTextID() {
506 return textID;
507 }
508 });
509 }
510 else if (args[0].equals("-stat"))
511 index.stat();
512 else if (args[0].equals("-anchorpage"))
513 System.out.println(index.blockmarkerBeforeFirstOccurrence(
514 Integer.parseInt(args[1]), args[2]));
515 else {
516 SearchResults results;
517 if (args[0].charAt(0) == '_') {
518 args[0] = args[0].substring(1);
519 results = index.groupSearchResults(args);
520 System.out.println("phrase");
521 } else if (args[0].equals("-query")) {
522 results = index.querySearchResults(args[1]);
523 } else {
524 results = index.andSearchResults(args);
525 System.out.println("and");
526 }
527
528 for (results.gotoText(0L); results.currentTextID() != -1L;
529
530 results.gotoText(results.currentTextID() + 1L)) {
531 System.out.print("-- " + results.currentTextID() + "\n ");
532 for (; results.currentOffset() != -1; results.skipToNextHit())
533 System.out.print(" " +
534 results.currentOffset());
535 System.out.println();
536 }
537 }
538 index.close();
539 }
540 }