1   
2   
3   
4   
5   
6   
7   
8   
9   
10  
11  
12  
13  
14  
15  
16  package com.quiotix.html.parser;
17  
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Set;
import java.util.Vector;
24  
25  
26  
27  
28  
29  
30  
31  
32  
33  
34  
35  public class HtmlCollector extends HtmlVisitor {
36  
37      protected ElementStack tagStack = new ElementStack();
38      protected ElementStack elements;
39      protected boolean collected;
40      protected static Set dontMatch = new HashSet();
41      protected static String[] dontMatchStrings
42      = {"AREA", "BASE", "BASEFONT", "BR", "COL", "HR", "IMG", "INPUT",
43         "ISINDEX", "LINK", "META", "PARAM"};
44  
45      static {
46          for (int i = 0; i < dontMatchStrings.length; i++)
47              dontMatch.add(dontMatchStrings[i]);
48      }
49  
50      private static class TagStackEntry {
51          String tagName;
52          int index;
53      }
54  
55      private static class ElementStack extends Vector {
56  
57          ElementStack() {
58              super();
59          }
60  
61          ElementStack(int n) {
62              super(n);
63          }
64  
65          void popN(int n) {
66              elementCount -= n;
67          }
68      }
69  
70      protected int pushNode(HtmlDocument.HtmlElement e) {
71          elements.addElement(e);
72          return elements.size() - 1;
73      }
74  
75      public void visit(HtmlDocument.Comment c) {
76          pushNode(c);
77      }
78  
79      public void visit(HtmlDocument.Text t) {
80          pushNode(t);
81      }
82  
83      public void visit(HtmlDocument.Newline n) {
84          pushNode(n);
85      }
86  
87      public void visit(HtmlDocument.Tag t) {
88          TagStackEntry ts = new TagStackEntry();
89          int index;
90  
91          
92          
93          index = pushNode(t);
94          if (!t.emptyTag
95                  && !dontMatch.contains(t.tagName.toUpperCase())) {
96              ts.tagName = t.tagName;
97              ts.index = index;
98              tagStack.addElement(ts);
99          }
100     }
101 
102     public void visit(HtmlDocument.EndTag t) {
103         int i;
104         for (i = tagStack.size() - 1; i >= 0; i--) {
105             TagStackEntry ts = (TagStackEntry) tagStack.elementAt(i);
106             if (t.tagName.equalsIgnoreCase(ts.tagName)) {
107                 HtmlDocument.TagBlock block;
108                 HtmlDocument.ElementSequence blockElements;
109                 HtmlDocument.Tag tag;
110 
111                 
112                 blockElements =
113                         new HtmlDocument.ElementSequence(elements.size() - ts.index - 1);
114                 for (int j = ts.index + 1; j < elements.size(); j++)
115                     blockElements.addElement((HtmlDocument.HtmlElement)
116                             elements.elementAt(j));
117                 tag = (HtmlDocument.Tag) elements.elementAt(ts.index);
118                 block = new HtmlDocument.TagBlock(tag.tagName,
119                         tag.attributeList, blockElements);
120 
121                 
122                 elements.popN(elements.size() - ts.index);
123                 elements.addElement(block);
124 
125                 
126                 tagStack.popN(tagStack.size() - i);
127 
128                 collected = true;
129                 break;
130             }
131         }
132 
133         
134         if (i < 0)
135             pushNode(t);
136     }
137 
138     public void visit(HtmlDocument.TagBlock bl) {
139         HtmlCollector c = new HtmlCollector();
140 
141         c.start();
142         c.visit(bl.body);
143         c.finish();
144         pushNode(bl);
145     }
146 
147     public void visit(HtmlDocument.ElementSequence s) {
148         elements = new ElementStack(s.size());
149         collected = false;
150 
151         for (Iterator iterator = s.iterator(); iterator.hasNext();) {
152             HtmlDocument.HtmlElement htmlElement = (HtmlDocument.HtmlElement) iterator.next();
153             htmlElement.accept(this);
154         }
155         if (collected)
156             s.setElements(elements);
157     }
158 
159     
160 
161 
162     public static void main(String[] args) throws Exception {
163         InputStream r = new FileInputStream(args[0]);
164 
165         try {
166             HtmlDocument document = new HtmlParser(r).HtmlDocument();
167             document.accept(new HtmlScrubber());
168             document.accept(new HtmlCollector());
169             document.accept(new HtmlDumper(System.out));
170         } finally {
171             r.close();
172         }
173     }
174 }
175