1
2 package com.quiotix.html.parser;
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 public class HtmlParser implements HtmlParserConstants {
27
// Platform line separator, read once from the "line.separator" system property.
// CommentTag() appends it in place of each COMMENT_EOL token.
28 final static String NL = System.getProperty("line.separator");
29
30 private static String getTokenText(Token first, Token cur) {
31 Token t;
32 StringBuffer sb = new StringBuffer();
33
34 for (t=first; t != cur.next; t = t.next) {
35 if (t.specialToken != null) {
36 Token tt=t.specialToken;
37 while (tt.specialToken != null)
38 tt = tt.specialToken;
39 for (; tt != null; tt = tt.next)
40 sb.append(tt.image);
41 };
42 sb.append(t.image);
43 };
44 return sb.toString();
45 }
46
47
48 public static void main(String[] args) throws ParseException {
49 HtmlParser parser = new HtmlParser(System.in);
50 HtmlDocument doc = parser.HtmlDocument();
51 doc.accept(new HtmlDumper(System.out));
52 System.exit(0);
53 }
54
55
56 final public HtmlDocument HtmlDocument() throws ParseException {
57 HtmlDocument.ElementSequence s;
58 s = ElementSequence();
59 jj_consume_token(0);
60 {if (true) return new HtmlDocument(s);}
61 throw new Error("Missing return statement in function");
62 }
63
64
65 final public HtmlDocument.ElementSequence ElementSequence() throws ParseException {
66 HtmlDocument.ElementSequence s = new HtmlDocument.ElementSequence();
67 HtmlDocument.HtmlElement h;
68 label_1:
69 while (true) {
70 switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
71 case EOL:
72 case TAG_START:
73 case ENDTAG_START:
74 case COMMENT_START:
75 case DECL_START:
76 case PCDATA:
77 ;
78 break;
79 default:
80 jj_la1[0] = jj_gen;
81 break label_1;
82 }
83 h = Element();
84 s.addElement(h);
85 }
86 {if (true) return s;}
87 throw new Error("Missing return statement in function");
88 }
89
90
91 final public HtmlDocument.HtmlElement Element() throws ParseException {
92 HtmlDocument.HtmlElement e;
93 Token text;
94 if (jj_2_1(2)) {
95 e = Tag();
96 {if (true) return e;}
97 } else {
98 switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
99 case ENDTAG_START:
100 e = EndTag();
101 {if (true) return e;}
102 break;
103 case COMMENT_START:
104 e = CommentTag();
105 {if (true) return e;}
106 break;
107 case DECL_START:
108 e = DeclTag();
109 {if (true) return e;}
110 break;
111 default:
112 jj_la1[1] = jj_gen;
113 if (jj_2_2(2)) {
114 e = ScriptBlock();
115 {if (true) return e;}
116 } else if (jj_2_3(2)) {
117 e = StyleBlock();
118 {if (true) return e;}
119 } else if (jj_2_4(2)) {
120 jj_consume_token(TAG_START);
121 text = jj_consume_token(LST_ERROR);
122 {if (true) return new HtmlDocument.Text("<" + text.image);}
123 } else {
124 switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
125 case PCDATA:
126 text = jj_consume_token(PCDATA);
127 {if (true) return new HtmlDocument.Text(text.image);}
128 break;
129 case EOL:
130 jj_consume_token(EOL);
131 {if (true) return new HtmlDocument.Newline();}
132 break;
133 default:
134 jj_la1[2] = jj_gen;
135 jj_consume_token(-1);
136 throw new ParseException();
137 }
138 }
139 }
140 }
141 throw new Error("Missing return statement in function");
142 }
143
144
145 final public HtmlDocument.Attribute Attribute() throws ParseException {
146 Token t1, t2=null;
147 t1 = jj_consume_token(ATTR_NAME);
148 switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
149 case ATTR_EQ:
150 jj_consume_token(ATTR_EQ);
151 t2 = jj_consume_token(ATTR_VAL);
152 break;
153 default:
154 jj_la1[3] = jj_gen;
155 ;
156 }
157 if (t2 == null)
158 {if (true) return new HtmlDocument.Attribute(t1.image);}
159 else
160 {if (true) return new HtmlDocument.Attribute(t1.image, t2.image);}
161 throw new Error("Missing return statement in function");
162 }
163
164
165 final public HtmlDocument.AttributeList AttributeList() throws ParseException {
166 HtmlDocument.AttributeList alist = new HtmlDocument.AttributeList();
167 HtmlDocument.Attribute a;
168 label_2:
169 while (true) {
170 switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
171 case ATTR_NAME:
172 ;
173 break;
174 default:
175 jj_la1[4] = jj_gen;
176 break label_2;
177 }
178 a = Attribute();
179 alist.addAttribute(a);
180 }
181 {if (true) return alist;}
182 throw new Error("Missing return statement in function");
183 }
184
185
186 final public HtmlDocument.HtmlElement Tag() throws ParseException {
187 Token t, et;
188 HtmlDocument.AttributeList alist;
189 Token firstToken = getToken(1);
190 try {
191 jj_consume_token(TAG_START);
192 t = jj_consume_token(TAG_NAME);
193 alist = AttributeList();
194 switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
195 case TAG_END:
196 et = jj_consume_token(TAG_END);
197 break;
198 case TAG_SLASHEND:
199 et = jj_consume_token(TAG_SLASHEND);
200 break;
201 default:
202 jj_la1[5] = jj_gen;
203 jj_consume_token(-1);
204 throw new ParseException();
205 }
206 HtmlDocument.Tag tag = new HtmlDocument.Tag(t.image, alist);
207 if (et.kind == TAG_SLASHEND) tag.setEmpty(true);
208 {if (true) return tag;}
209 } catch (ParseException ex) {
210 token_source.SwitchTo(DEFAULT);
211 String s = getTokenText(firstToken, getNextToken());
212 {if (true) return new HtmlDocument.Text(s);}
213 }
214 throw new Error("Missing return statement in function");
215 }
216
217
218 final public HtmlDocument.ElementSequence BlockContents() throws ParseException {
219 Token t;
220 StringBuffer s = new StringBuffer();
221 HtmlDocument.ElementSequence e = new HtmlDocument.ElementSequence();
222 label_3:
223 while (true) {
224 switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
225 case BLOCK_EOL:
226 case BLOCK_LBR:
227 case BLOCK_WORD:
228 ;
229 break;
230 default:
231 jj_la1[6] = jj_gen;
232 break label_3;
233 }
234 switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
235 case BLOCK_EOL:
236 jj_consume_token(BLOCK_EOL);
237 if (s.length() > 0) {
238 e.addElement(new HtmlDocument.Text(s.toString()));
239 s.setLength(0);
240 };
241 e.addElement(new HtmlDocument.Newline());
242 break;
243 case BLOCK_WORD:
244 t = jj_consume_token(BLOCK_WORD);
245 s.append(t.image);
246 break;
247 case BLOCK_LBR:
248 t = jj_consume_token(BLOCK_LBR);
249 s.append(t.image);
250 break;
251 default:
252 jj_la1[7] = jj_gen;
253 jj_consume_token(-1);
254 throw new ParseException();
255 }
256 }
257 if (s.length() > 0)
258 e.addElement(new HtmlDocument.Text(s.toString()));
259 e.addElement(new HtmlDocument.Newline());
260 {if (true) return e;}
261 throw new Error("Missing return statement in function");
262 }
263
264
265 final public HtmlDocument.HtmlElement ScriptBlock() throws ParseException {
266 HtmlDocument.AttributeList alist;
267 HtmlDocument.ElementSequence e;
268 Token firstToken = getToken(1);
269 try {
270 jj_consume_token(TAG_START);
271 jj_consume_token(TAG_SCRIPT);
272 alist = AttributeList();
273 jj_consume_token(TAG_END);
274 token_source.SwitchTo(LexScript);
275 e = BlockContents();
276 jj_consume_token(SCRIPT_END);
277 {if (true) return new HtmlDocument.TagBlock("SCRIPT", alist, e);}
278 } catch (ParseException ex) {
279 token_source.SwitchTo(DEFAULT);
280 String s = getTokenText(firstToken, getNextToken());
281 {if (true) return new HtmlDocument.Text(s);}
282 }
283 throw new Error("Missing return statement in function");
284 }
285
286
287 final public HtmlDocument.HtmlElement StyleBlock() throws ParseException {
288 HtmlDocument.AttributeList alist;
289 HtmlDocument.ElementSequence e;
290 Token firstToken = getToken(1);
291 try {
292 jj_consume_token(TAG_START);
293 jj_consume_token(TAG_STYLE);
294 alist = AttributeList();
295 jj_consume_token(TAG_END);
296 token_source.SwitchTo(LexStyle);
297 e = BlockContents();
298 jj_consume_token(STYLE_END);
299 {if (true) return new HtmlDocument.TagBlock("STYLE", alist, e);}
300 } catch (ParseException ex) {
301 token_source.SwitchTo(DEFAULT);
302 String s = getTokenText(firstToken, getNextToken());
303 {if (true) return new HtmlDocument.Text(s);}
304 }
305 throw new Error("Missing return statement in function");
306 }
307
308
309 final public HtmlDocument.HtmlElement EndTag() throws ParseException {
310 Token t;
311 Token firstToken = getToken(1);
312 try {
313 jj_consume_token(ENDTAG_START);
314 t = jj_consume_token(TAG_NAME);
315 jj_consume_token(TAG_END);
316 {if (true) return new HtmlDocument.EndTag(t.image);}
317 } catch (ParseException ex) {
318 token_source.SwitchTo(DEFAULT);
319 String s = getTokenText(firstToken, getNextToken());
320 {if (true) return new HtmlDocument.Text(s);}
321 }
322 throw new Error("Missing return statement in function");
323 }
324
325
326 final public HtmlDocument.Comment CommentTag() throws ParseException {
327 Token t;
328 StringBuffer s = new StringBuffer("--");
329 jj_consume_token(COMMENT_START);
330 label_4:
331 while (true) {
332 switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
333 case DASH:
334 case COMMENT_EOL:
335 case COMMENT_WORD:
336 ;
337 break;
338 default:
339 jj_la1[8] = jj_gen;
340 break label_4;
341 }
342 switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
343 case DASH:
344 t = jj_consume_token(DASH);
345 s.append(t.image);
346 break;
347 case COMMENT_EOL:
348 jj_consume_token(COMMENT_EOL);
349 s.append(NL);
350 break;
351 case COMMENT_WORD:
352 t = jj_consume_token(COMMENT_WORD);
353 s.append(t.image);
354 break;
355 default:
356 jj_la1[9] = jj_gen;
357 jj_consume_token(-1);
358 throw new ParseException();
359 }
360 }
361 switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
362 case 0:
363 jj_consume_token(0);
364 break;
365 case COMMENT_END:
366 jj_consume_token(COMMENT_END);
367 break;
368 default:
369 jj_la1[10] = jj_gen;
370 jj_consume_token(-1);
371 throw new ParseException();
372 }
373 {if (true) return new HtmlDocument.Comment(s.append("--").toString());}
374 throw new Error("Missing return statement in function");
375 }
376
377
378 final public HtmlDocument.Comment DeclTag() throws ParseException {
379 Token t;
380 jj_consume_token(DECL_START);
381 t = jj_consume_token(DECL_ANY);
382 jj_consume_token(DECL_END);
383 {if (true) return new HtmlDocument.Comment(t.image);}
384 throw new Error("Missing return statement in function");
385 }
386
387 private boolean jj_2_1(int xla) {
388 jj_la = xla; jj_lastpos = jj_scanpos = token;
389 try { return !jj_3_1(); }
390 catch(LookaheadSuccess ls) { return true; }
391 finally { jj_save(0, xla); }
392 }
393
394 private boolean jj_2_2(int xla) {
395 jj_la = xla; jj_lastpos = jj_scanpos = token;
396 try { return !jj_3_2(); }
397 catch(LookaheadSuccess ls) { return true; }
398 finally { jj_save(1, xla); }
399 }
400
401 private boolean jj_2_3(int xla) {
402 jj_la = xla; jj_lastpos = jj_scanpos = token;
403 try { return !jj_3_3(); }
404 catch(LookaheadSuccess ls) { return true; }
405 finally { jj_save(2, xla); }
406 }
407
408 private boolean jj_2_4(int xla) {
409 jj_la = xla; jj_lastpos = jj_scanpos = token;
410 try { return !jj_3_4(); }
411 catch(LookaheadSuccess ls) { return true; }
412 finally { jj_save(3, xla); }
413 }
414
415 private boolean jj_3_1() {
416 if (jj_3R_5()) return true;
417 return false;
418 }
419
420 private boolean jj_3R_6() {
421 if (jj_scan_token(TAG_START)) return true;
422 if (jj_scan_token(TAG_SCRIPT)) return true;
423 return false;
424 }
425
426 private boolean jj_3_4() {
427 if (jj_scan_token(TAG_START)) return true;
428 if (jj_scan_token(LST_ERROR)) return true;
429 return false;
430 }
431
432 private boolean jj_3_3() {
433 if (jj_3R_7()) return true;
434 return false;
435 }
436
437 private boolean jj_3_2() {
438 if (jj_3R_6()) return true;
439 return false;
440 }
441
442 private boolean jj_3R_7() {
443 if (jj_scan_token(TAG_START)) return true;
444 if (jj_scan_token(TAG_STYLE)) return true;
445 return false;
446 }
447
448 private boolean jj_3R_5() {
449 if (jj_scan_token(TAG_START)) return true;
450 if (jj_scan_token(TAG_NAME)) return true;
451 return false;
452 }
453
454
// Token manager that supplies tokens through getNextToken().
455 public HtmlParserTokenManager token_source;
// Character stream wrapped by the token manager. Only the InputStream and
// Reader constructors assign it; the token-manager constructor leaves it unset.
456 SimpleCharStream jj_input_stream;
457
// Most recently consumed token; later tokens hang off it through Token.next.
458 public Token token;
459
// Set by jj_ntk() to token.next (stays null when a new token had to be fetched).
460 public Token jj_nt;
// Cached kind of the next token, or -1 when it has not been computed yet.
461 private int jj_ntk;
// Lookahead cursors: jj_scanpos is the token being examined, jj_lastpos the
// furthest token reached during the current lookahead.
462 private Token jj_scanpos, jj_lastpos;
// Remaining lookahead budget; decremented each time the scan advances past
// jj_lastpos.
463 private int jj_la;
464
// When true, getToken() counts from jj_scanpos instead of token. Nothing in
// this class sets it to true.
465 public boolean lookingAhead = false;
466
467
// Generation counter, incremented on every successfully consumed token.
468 private int jj_gen;
// For each of the 11 choice points, the jj_gen value at which it last fell
// through to its default branch (-1 initially).
469 final private int[] jj_la1 = new int[11];
// Per-choice-point bitmasks of expected token kinds, read by
// generateParseException(): bit j of jj_la1_0[i] stands for kind j, and
// bit j of jj_la1_1[i] stands for kind 32 + j.
470 static private int[] jj_la1_0;
471 static private int[] jj_la1_1;
472 static {
473 jj_la1_init_0();
474 jj_la1_init_1();
475 }
// Fills the mask table for token kinds 0..31.
476 private static void jj_la1_init_0() {
477 jj_la1_0 = new int[] {0xfc000,0x70000,0x84000,0x10000000,0x2000000,0xc000000,0x0,0x0,0x0,0x0,0x1,};
478 }
// Fills the mask table for token kinds 32 and above.
479 private static void jj_la1_init_1() {
480 jj_la1_1 = new int[] {0x0,0x0,0x0,0x0,0x0,0x0,0x1c00,0x1c00,0x38,0x38,0x4,};
481 }
// One chain of saved lookahead attempts per lookahead check (jj_2_1..jj_2_4).
482 final private JJCalls[] jj_2_rtns = new JJCalls[4];
// True only while jj_rescan_token() replays lookaheads for error reporting.
483 private boolean jj_rescan = false;
// Counts consumed tokens; every time it exceeds 100, jj_consume_token() resets
// it and clears stale saved-lookahead token references.
484 private int jj_gc = 0;
485
486
/**
 * Creates a parser reading from {@code stream}, passing a null encoding to
 * the two-argument constructor.
 *
 * @param stream the input to parse
 */
487 public HtmlParser(java.io.InputStream stream) {
488 this(stream, null);
489 }
490
/**
 * Creates a parser reading from {@code stream} using the given encoding.
 *
 * @param stream the input to parse
 * @param encoding character encoding name handed to the character stream
 * @throws RuntimeException wrapping the UnsupportedEncodingException if the
 *     character stream rejects the encoding
 */
public HtmlParser(java.io.InputStream stream, String encoding) {
  try {
    // The two 1s are presumably the starting line and column -- confirm in SimpleCharStream.
    jj_input_stream = new SimpleCharStream(stream, encoding, 1, 1);
  } catch (java.io.UnsupportedEncodingException e) {
    throw new RuntimeException(e);
  }
  token_source = new HtmlParserTokenManager(jj_input_stream);
  token = new Token();
  jj_ntk = -1;
  jj_gen = 0;
  java.util.Arrays.fill(jj_la1, -1);
  for (int slot = 0; slot < jj_2_rtns.length; slot++) {
    jj_2_rtns[slot] = new JJCalls();
  }
}
500
501
/**
 * Re-initializes the parser on a new input stream, passing a null encoding
 * to the two-argument overload.
 *
 * @param stream the new input to parse
 */
502 public void ReInit(java.io.InputStream stream) {
503 ReInit(stream, null);
504 }
505
506 public void ReInit(java.io.InputStream stream, String encoding) {
507 try { jj_input_stream.ReInit(stream, encoding, 1, 1); } catch(java.io.UnsupportedEncodingException e) { throw new RuntimeException(e); }
508 token_source.ReInit(jj_input_stream);
509 token = new Token();
510 jj_ntk = -1;
511 jj_gen = 0;
512 for (int i = 0; i < 11; i++) jj_la1[i] = -1;
513 for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
514 }
515
516
/**
 * Creates a parser reading characters from {@code stream}.
 *
 * @param stream the character input to parse
 */
public HtmlParser(java.io.Reader stream) {
  // The two 1s are presumably the starting line and column -- confirm in SimpleCharStream.
  jj_input_stream = new SimpleCharStream(stream, 1, 1);
  token_source = new HtmlParserTokenManager(jj_input_stream);
  token = new Token();
  jj_ntk = -1;
  jj_gen = 0;
  java.util.Arrays.fill(jj_la1, -1);
  for (int slot = 0; slot < jj_2_rtns.length; slot++) {
    jj_2_rtns[slot] = new JJCalls();
  }
}
526
527
528 public void ReInit(java.io.Reader stream) {
529 jj_input_stream.ReInit(stream, 1, 1);
530 token_source.ReInit(jj_input_stream);
531 token = new Token();
532 jj_ntk = -1;
533 jj_gen = 0;
534 for (int i = 0; i < 11; i++) jj_la1[i] = -1;
535 for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
536 }
537
538
/**
 * Creates a parser on top of an existing token manager. The parser's own
 * character-stream field is not assigned by this constructor.
 *
 * @param tm the token manager to pull tokens from
 */
public HtmlParser(HtmlParserTokenManager tm) {
  token_source = tm;
  token = new Token();
  jj_ntk = -1;
  jj_gen = 0;
  java.util.Arrays.fill(jj_la1, -1);
  for (int slot = 0; slot < jj_2_rtns.length; slot++) {
    jj_2_rtns[slot] = new JJCalls();
  }
}
547
548
549 public void ReInit(HtmlParserTokenManager tm) {
550 token_source = tm;
551 token = new Token();
552 jj_ntk = -1;
553 jj_gen = 0;
554 for (int i = 0; i < 11; i++) jj_la1[i] = -1;
555 for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
556 }
557
558 private Token jj_consume_token(int kind) throws ParseException {
559 Token oldToken;
560 if ((oldToken = token).next != null) token = token.next;
561 else token = token.next = token_source.getNextToken();
562 jj_ntk = -1;
563 if (token.kind == kind) {
564 jj_gen++;
565 if (++jj_gc > 100) {
566 jj_gc = 0;
567 for (int i = 0; i < jj_2_rtns.length; i++) {
568 JJCalls c = jj_2_rtns[i];
569 while (c != null) {
570 if (c.gen < jj_gen) c.first = null;
571 c = c.next;
572 }
573 }
574 }
575 return token;
576 }
577 token = oldToken;
578 jj_kind = kind;
579 throw generateParseException();
580 }
581
// Control-flow signal thrown by jj_scan_token() when the lookahead budget
// reaches zero on a matching token; the jj_2_N methods catch it and report
// the lookahead as successful.
582 static private final class LookaheadSuccess extends java.lang.Error { }
// Single instance, created once per parser and rethrown on every success.
583 final private LookaheadSuccess jj_ls = new LookaheadSuccess();
584 private boolean jj_scan_token(int kind) {
585 if (jj_scanpos == jj_lastpos) {
586 jj_la--;
587 if (jj_scanpos.next == null) {
588 jj_lastpos = jj_scanpos = jj_scanpos.next = token_source.getNextToken();
589 } else {
590 jj_lastpos = jj_scanpos = jj_scanpos.next;
591 }
592 } else {
593 jj_scanpos = jj_scanpos.next;
594 }
595 if (jj_rescan) {
596 int i = 0; Token tok = token;
597 while (tok != null && tok != jj_scanpos) { i++; tok = tok.next; }
598 if (tok != null) jj_add_error_token(kind, i);
599 }
600 if (jj_scanpos.kind != kind) return true;
601 if (jj_la == 0 && jj_scanpos == jj_lastpos) throw jj_ls;
602 return false;
603 }
604
605
606
607 final public Token getNextToken() {
608 if (token.next != null) token = token.next;
609 else token = token.next = token_source.getNextToken();
610 jj_ntk = -1;
611 jj_gen++;
612 return token;
613 }
614
615
616 final public Token getToken(int index) {
617 Token t = lookingAhead ? jj_scanpos : token;
618 for (int i = 0; i < index; i++) {
619 if (t.next != null) t = t.next;
620 else t = t.next = token_source.getNextToken();
621 }
622 return t;
623 }
624
625 private int jj_ntk() {
626 if ((jj_nt=token.next) == null)
627 return (jj_ntk = (token.next=token_source.getNextToken()).kind);
628 else
629 return (jj_ntk = jj_nt.kind);
630 }
631
// Expected-token sequences (each an int[] of token kinds) gathered while
// building a ParseException.
632 private java.util.List jj_expentries = new java.util.ArrayList();
// Scratch reference to the sequence currently being built.
633 private int[] jj_expentry;
// Kind requested by the failing jj_consume_token() call, or -1 when none.
634 private int jj_kind = -1;
// Token kinds seen along the lookahead path currently being replayed; holds
// at most 100 entries.
635 private int[] jj_lasttokens = new int[100];
// Number of valid entries in jj_lasttokens.
636 private int jj_endpos;
637
638 private void jj_add_error_token(int kind, int pos) {
639 if (pos >= 100) return;
640 if (pos == jj_endpos + 1) {
641 jj_lasttokens[jj_endpos++] = kind;
642 } else if (jj_endpos != 0) {
643 jj_expentry = new int[jj_endpos];
644 for (int i = 0; i < jj_endpos; i++) {
645 jj_expentry[i] = jj_lasttokens[i];
646 }
647 boolean exists = false;
648 for (java.util.Iterator it = jj_expentries.iterator(); it.hasNext();) {
649 int[] oldentry = (int[])(it.next());
650 if (oldentry.length == jj_expentry.length) {
651 exists = true;
652 for (int i = 0; i < jj_expentry.length; i++) {
653 if (oldentry[i] != jj_expentry[i]) {
654 exists = false;
655 break;
656 }
657 }
658 if (exists) break;
659 }
660 }
661 if (!exists) jj_expentries.add(jj_expentry);
662 if (pos != 0) jj_lasttokens[(jj_endpos = pos) - 1] = kind;
663 }
664 }
665
666
667 public ParseException generateParseException() {
668 jj_expentries.clear();
669 boolean[] la1tokens = new boolean[45];
670 if (jj_kind >= 0) {
671 la1tokens[jj_kind] = true;
672 jj_kind = -1;
673 }
674 for (int i = 0; i < 11; i++) {
675 if (jj_la1[i] == jj_gen) {
676 for (int j = 0; j < 32; j++) {
677 if ((jj_la1_0[i] & (1<<j)) != 0) {
678 la1tokens[j] = true;
679 }
680 if ((jj_la1_1[i] & (1<<j)) != 0) {
681 la1tokens[32+j] = true;
682 }
683 }
684 }
685 }
686 for (int i = 0; i < 45; i++) {
687 if (la1tokens[i]) {
688 jj_expentry = new int[1];
689 jj_expentry[0] = i;
690 jj_expentries.add(jj_expentry);
691 }
692 }
693 jj_endpos = 0;
694 jj_rescan_token();
695 jj_add_error_token(0, 0);
696 int[][] exptokseq = new int[jj_expentries.size()][];
697 for (int i = 0; i < jj_expentries.size(); i++) {
698 exptokseq[i] = (int[])jj_expentries.get(i);
699 }
700 return new ParseException(token, exptokseq, tokenImage);
701 }
702
703
/** Does nothing: the method body is empty, so there is no tracing to enable. */
704 final public void enable_tracing() {
705 }
706
707
/** Does nothing: the method body is empty, so there is no tracing to disable. */
708 final public void disable_tracing() {
709 }
710
/**
 * Replays the saved lookahead attempts with {@code jj_rescan} set, so that
 * jj_scan_token() reports every kind it checks to jj_add_error_token().
 * Called from generateParseException() to collect expected-token sequences.
 */
711 private void jj_rescan_token() {
712 jj_rescan = true;
713 for (int i = 0; i < 4; i++) {
714 try {
715 JJCalls p = jj_2_rtns[i];
716 do {
// Only replay entries whose recorded generation is still ahead of jj_gen.
717 if (p.gen > jj_gen) {
// Restore the budget and starting token saved by jj_save().
718 jj_la = p.arg; jj_lastpos = jj_scanpos = p.first;
719 switch (i) {
720 case 0: jj_3_1(); break;
721 case 1: jj_3_2(); break;
722 case 2: jj_3_3(); break;
723 case 3: jj_3_4(); break;
724 }
725 }
726 p = p.next;
727 } while (p != null);
// The try wraps the whole chain walk, so a LookaheadSuccess also skips the
// remaining entries of chain i before moving on to the next chain.
728 } catch(LookaheadSuccess ls) { }
729 }
730 jj_rescan = false;
731 }
732
733 private void jj_save(int index, int xla) {
734 JJCalls p = jj_2_rtns[index];
735 while (p.gen > jj_gen) {
736 if (p.next == null) { p = p.next = new JJCalls(); break; }
737 p = p.next;
738 }
739 p.gen = jj_gen + xla - jj_la; p.first = token; p.arg = xla;
740 }
741
/** One saved lookahead attempt; entries for the same lookahead check form a linked list. */
742 static final class JJCalls {
// Generation recorded by jj_save(): jj_gen + xla - jj_la at the time of saving.
743 int gen;
// Token the lookahead started from; cleared by jj_consume_token() once stale.
744 Token first;
// Lookahead budget (xla) the attempt was started with.
745 int arg;
// Next saved attempt for the same lookahead check, or null.
746 JJCalls next;
747 }
748
749 }