1 package org.paneris.bibliomania.fti;
2
3 import java.io.BufferedInputStream;
4 import java.io.EOFException;
5 import java.io.FileInputStream;
6 import java.io.IOException;
7 import java.io.InputStream;
8 import java.util.Enumeration;
9 import java.util.NoSuchElementException;
10
11 import org.melati.util.UnexpectedExceptionException;
12
13 public class IndexTokenizer implements Enumeration {
14
15 private InputStream reader;
16 private int pushedBack = -1;
17 private int offset = -1, wordIndex = -1, wordOffset;
18 private StringBuffer buffer = new StringBuffer();
19 private String nextWord = null;
20 private boolean hadBreak = false;
21 private boolean lastWasAnchor = false;
22 private boolean inStrong = false;
23 private boolean inScript = false;
24 private boolean wantEverything;
25
26 public IndexTokenizer(InputStream reader, boolean wantEverything) {
27 this.reader = reader;
28 this.wantEverything = wantEverything;
29 }
30
31 public IndexTokenizer(InputStream reader) {
32 this(reader, false);
33 }
34
35 private int nextChar() throws IOException {
36 int c;
37 if (pushedBack == -1) {
38 c = reader.read();
39 if (c == -1)
40 throw new EOFException();
41 } else {
42 c = pushedBack;
43 pushedBack = -1;
44 }
45
46 ++offset;
47 return c;
48 }
49
50 private void pushback(int c) {
51 if (pushedBack != -1)
52 throw new IllegalArgumentException("pushed back already");
53 pushedBack = c;
54 --offset;
55 }
56
57 private void string(int term) throws IOException {
58 int c;
59 do {
60 if ((c = nextChar()) == '\\') {
61 nextChar();
62 c = nextChar();
63 }
64 } while (c != term);
65 }
66
67 private void comment() throws IOException {
68 for (;;) {
69 int c;
70 if ((c = nextChar()) == '-')
71 if ((c = nextChar()) == '-' && (c = nextChar()) == '>')
72 break;
73 else
74 pushback(c);
75 }
76 }
77
78 private String tag() throws IOException {
79 int tagOffset = offset;
80 String anchorName = null;
81 int c;
82 if ((c = nextChar()) == '!'
83 && (c = nextChar()) == '-'
84 && (c = nextChar()) == '-')
85 comment();
86 else {
87 boolean sense;
88 if (c == '/') {
89 sense = false;
90 c = nextChar();
91 } else
92 sense = true;
93
94 boolean isA = false;
95
96 while (Character.isWhitespace((char)c))
97 c = nextChar();
98
99 boolean lastWhite = false;
100
101 inStrong = false;
102
103 if (c == 'a' || c == 'A') {
104 if (Character.isWhitespace((char) (c = nextChar()))) {
105 isA = sense;
106 lastWhite = true;
107 }
108 } else if (c == 's' || c == 'S') {
109 c = nextChar();
110 if ((c == 't' || c == 'T')
111 && ((c = nextChar()) == 'r' || c == 'R')
112 && ((c = nextChar()) == 'o' || c == 'O')
113 && ((c = nextChar()) == 'n' || c == 'N')
114 && ((c = nextChar()) == 'g' || c == 'G'))
115 inStrong = sense;
116 else if (
117 (c == 'c' || c == 'C')
118 && ((c = nextChar()) == 'r' || c == 'R')
119 && ((c = nextChar()) == 'i' || c == 'I')
120 && ((c = nextChar()) == 'p' || c == 'P')
121 && ((c = nextChar()) == 't' || c == 'T'))
122 inScript = sense;
123 }
124
125 for (; c != '>'; c = nextChar())
126 if (c == '"' || c == '\'')
127 string(c);
128 else if (isA) {
129 if (lastWhite
130 && (c == 'n' || c == 'N')
131 && ((c = nextChar()) == 'a' || c == 'A')
132 && ((c = nextChar()) == 'm' || c == 'M')
133 && ((c = nextChar()) == 'e' || c == 'E')
134 && ((c = nextChar()) == '=')) {
135 StringBuffer anchorNameBuf = new StringBuffer();
136 if ((c = nextChar()) == '"' || c == '\'') {
137 int term = c;
138 while ((c = nextChar()) != term)
139 anchorNameBuf.append((char)c);
140 } else {
141 for (;
142 Character.isLetterOrDigit((char)c) || c == '_';
143 c = nextChar())
144 anchorNameBuf.append((char)c);
145 pushback(c);
146 }
147
148 anchorName = anchorNameBuf.toString();
149 } else
150 lastWhite = Character.isWhitespace((char)c);
151 }
152 }
153
154 if (anchorName != null)
155 wordOffset = tagOffset;
156 return anchorName;
157 }
158
159 private void element() throws IOException {
160 int c;
161 while (Character.isLetterOrDigit((char) (c = nextChar())));
162 if (c != ';')
163 pushback(c);
164 }
165
166 private static boolean isPrint(char c) {
167 return !Character.isWhitespace(c)
168 && !Character.isISOControl(c)
169 && c != '<'
170 && c != '&';
171 }
172
173 private boolean isInteresting(char c) {
174 return wantEverything ? isPrint(c) : Character.isLetter(c);
175 }
176
177 private boolean isInterestingNonLetter(char c) {
178 return isPrint(c) && !Character.isLetter(c);
179 }
180
181 private String _nextWord() throws IOException {
182 try {
183 int c;
184
185 hadBreak = lastWasAnchor;
186 while (!isInteresting((char) (c = nextChar())) || inScript) {
187 hadBreak = true;
188 if (c == '<') {
189 String anchorName = tag();
190 if (anchorName != null) {
191
192 lastWasAnchor = true;
193 return "#" + anchorName;
194 }
195 } else if (c == '&')
196 element();
197 }
198
199 lastWasAnchor = false;
200 wordOffset = offset;
201
202 buffer.setLength(1);
203 buffer.setCharAt(0, (char)c);
204
205 if (Character.isLetter((char)c)) {
206 while (Character.isLetter((char) (c = nextChar())))
207 buffer.append((char)c);
208 ++wordIndex;
209 } else
210 while (isInterestingNonLetter((char) (c = nextChar())))
211 buffer.append((char)c);
212
213 pushback(c);
214
215 String word = buffer.toString();
216
217 if (buffer.capacity() > 1000)
218 buffer = new StringBuffer();
219
220 return inStrong && !wantEverything ? "$" + word : word;
221 } catch (EOFException e) {
222 return null;
223 }
224 }
225
226 public boolean hadBreak() {
227 return hadBreak;
228 }
229
230 public int wordOffset() {
231 return wordOffset;
232 }
233
234 public int bytesReadFromUnderlyingStream() {
235 return offset + (pushedBack == -1 ? 1 : 2);
236 }
237
238 public int wordIndex() {
239 return wordIndex;
240 }
241
242 public synchronized String nextWord() {
243 if (!hasMoreElements())
244 throw new NoSuchElementException();
245
246 try {
247 return nextWord;
248 } finally {
249 nextWord = null;
250 }
251 }
252
253 public final Object nextElement() {
254 return nextWord();
255 }
256
257 public synchronized boolean hasMoreWords() throws IOException {
258 return nextWord != null || (nextWord = _nextWord()) != null;
259 }
260
261 public final boolean hasMoreElements() {
262 try {
263 return hasMoreWords();
264 } catch (IOException e) {
265 throw new UnexpectedExceptionException(e);
266 }
267 }
268
269 public static void main(String[] args) throws Exception {
270 if (args[0].equals("-context")) {
271 for (IndexTokenizer words =
272 new IndexTokenizer(
273 new BufferedInputStream(new FileInputStream(args[1])),
274 true);
275 words.hasMoreWords();
276 ) {
277 String word = words.nextWord();
278 if (!word.startsWith("#")) {
279 if (words.hadBreak())
280 System.out.print(" ");
281 System.out.print(word);
282 }
283 }
284 System.out.println();
285 } else {
286 for (IndexTokenizer words =
287 new IndexTokenizer(
288 new BufferedInputStream(new FileInputStream(args[0])),
289 args[1].equals("all"));
290 words.hasMoreWords();
291 )
292 System.out.print(
293 words.nextWord()
294 + "("
295 + words.wordOffset()
296 + ":"
297 + words.wordIndex()
298 + ") ");
299 System.out.println();
300 }
301 }
302 }