1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 package com.quiotix.html.parser;
17
18 import java.io.FileInputStream;
19 import java.io.InputStream;
20 import java.util.HashSet;
21 import java.util.Iterator;
22 import java.util.Set;
23 import java.util.Vector;
24
25
26
27
28
29
30
31
32
33
34
35 public class HtmlCollector extends HtmlVisitor {
36
37 protected ElementStack tagStack = new ElementStack();
38 protected ElementStack elements;
39 protected boolean collected;
40 protected static Set dontMatch = new HashSet();
41 protected static String[] dontMatchStrings
42 = {"AREA", "BASE", "BASEFONT", "BR", "COL", "HR", "IMG", "INPUT",
43 "ISINDEX", "LINK", "META", "PARAM"};
44
45 static {
46 for (int i = 0; i < dontMatchStrings.length; i++)
47 dontMatch.add(dontMatchStrings[i]);
48 }
49
50 private static class TagStackEntry {
51 String tagName;
52 int index;
53 }
54
55 private static class ElementStack extends Vector {
56
57 ElementStack() {
58 super();
59 }
60
61 ElementStack(int n) {
62 super(n);
63 }
64
65 void popN(int n) {
66 elementCount -= n;
67 }
68 }
69
70 protected int pushNode(HtmlDocument.HtmlElement e) {
71 elements.addElement(e);
72 return elements.size() - 1;
73 }
74
75 public void visit(HtmlDocument.Comment c) {
76 pushNode(c);
77 }
78
79 public void visit(HtmlDocument.Text t) {
80 pushNode(t);
81 }
82
83 public void visit(HtmlDocument.Newline n) {
84 pushNode(n);
85 }
86
87 public void visit(HtmlDocument.Tag t) {
88 TagStackEntry ts = new TagStackEntry();
89 int index;
90
91
92
93 index = pushNode(t);
94 if (!t.emptyTag
95 && !dontMatch.contains(t.tagName.toUpperCase())) {
96 ts.tagName = t.tagName;
97 ts.index = index;
98 tagStack.addElement(ts);
99 }
100 }
101
102 public void visit(HtmlDocument.EndTag t) {
103 int i;
104 for (i = tagStack.size() - 1; i >= 0; i--) {
105 TagStackEntry ts = (TagStackEntry) tagStack.elementAt(i);
106 if (t.tagName.equalsIgnoreCase(ts.tagName)) {
107 HtmlDocument.TagBlock block;
108 HtmlDocument.ElementSequence blockElements;
109 HtmlDocument.Tag tag;
110
111
112 blockElements =
113 new HtmlDocument.ElementSequence(elements.size() - ts.index - 1);
114 for (int j = ts.index + 1; j < elements.size(); j++)
115 blockElements.addElement((HtmlDocument.HtmlElement)
116 elements.elementAt(j));
117 tag = (HtmlDocument.Tag) elements.elementAt(ts.index);
118 block = new HtmlDocument.TagBlock(tag.tagName,
119 tag.attributeList, blockElements);
120
121
122 elements.popN(elements.size() - ts.index);
123 elements.addElement(block);
124
125
126 tagStack.popN(tagStack.size() - i);
127
128 collected = true;
129 break;
130 }
131 }
132
133
134 if (i < 0)
135 pushNode(t);
136 }
137
138 public void visit(HtmlDocument.TagBlock bl) {
139 HtmlCollector c = new HtmlCollector();
140
141 c.start();
142 c.visit(bl.body);
143 c.finish();
144 pushNode(bl);
145 }
146
147 public void visit(HtmlDocument.ElementSequence s) {
148 elements = new ElementStack(s.size());
149 collected = false;
150
151 for (Iterator iterator = s.iterator(); iterator.hasNext();) {
152 HtmlDocument.HtmlElement htmlElement = (HtmlDocument.HtmlElement) iterator.next();
153 htmlElement.accept(this);
154 }
155 if (collected)
156 s.setElements(elements);
157 }
158
159
160
161
162 public static void main(String[] args) throws Exception {
163 InputStream r = new FileInputStream(args[0]);
164
165 try {
166 HtmlDocument document = new HtmlParser(r).HtmlDocument();
167 document.accept(new HtmlScrubber());
168 document.accept(new HtmlCollector());
169 document.accept(new HtmlDumper(System.out));
170 } finally {
171 r.close();
172 }
173 }
174 }
175