HtmlStripper xref

View Javadoc

1   /**
2    * 
3    */
4   package com.quiotix.html.parser;
5   
6   import java.io.ByteArrayInputStream;
7   import java.io.ByteArrayOutputStream;
8   import java.io.FileInputStream;
9   import java.io.InputStream;
10  import java.io.OutputStream;
11  import java.io.UnsupportedEncodingException;
12  import java.util.HashSet;
13  import java.util.Iterator;
14  import java.util.Set;
15  
16  import com.quiotix.html.parser.HtmlDocument.Attribute;
17  
18  /**
19   * A runnable class intended to produce readable, sparse html from formatted
20   * pages.
21   * 
22   * @author timp
23   */
24  public class HtmlStripper extends HtmlDumper {
25  
26      protected static Set html1BlockTags = new HashSet();
27      protected static Set html1EmptyTags = new HashSet();
28      protected static Set html1Tags = new HashSet();
29      protected static String[] html1EmptyTagStrings = { "AREA", "BASE",
30              "BASEFONT", "BR", "COL", "HR", "IMG", "INPUT", "ISINDEX", "LINK",
31              "META", "PARAM", "NEXTID", "PLAINTEXT", };
32      protected static String[] html1BlockTagStrings = { "A", "ADDRESS", "B",
33              "BLOCKQUOTE", "BODY", "CITE", "CODE", "DD", "DFN", "DIR", "DL",
34              "DT", "EM", "H1", "H2", "H3", "H4", "H5", "H6", "HEAD", "HTML",
35              "I", "KBD", "KEY", "LI", "LISTING", "MENU", "OL", "P", "PRE",
36              "SAMP", "STRONG", "TITLE", "TT", "U", "UL", "VAR", "XMP" };
37  
38      protected static String[] html4BlockTagStrings = { "A", "ADDRESS", "B",
39              "BLOCKQUOTE", "BODY", "CITE", "CODE", "DD", "DFN", "DIR", "DL",
40              "DT", "EM", "H1", "H2", "H3", "H4", "H5", "H6", "HEAD", "HTML",
41              "I", "KBD", "KEY", "LI", "LISTING", "MENU", "OL", "P", "PRE",
42              "SAMP", "STRONG", "TITLE", "TABLE", "TR", "TH", "TD", "TT", "U",
43              "UL", "VAR", "XMP" };
44  
45      static {
46          for (int i = 0; i < html1EmptyTagStrings.length; i++)
47              html1BlockTags.add(html1EmptyTagStrings[i]);
48      }
49      static {
50          for (int i = 0; i < html4BlockTagStrings.length; i++)
51              html1BlockTags.add(html4BlockTagStrings[i]);
52      }
53      static {
54          for (int i = 0; i < html1EmptyTagStrings.length; i++)
55              html1Tags.add(html1EmptyTagStrings[i]);
56          for (int i = 0; i < html4BlockTagStrings.length; i++)
57              html1Tags.add(html4BlockTagStrings[i]);
58      }
59  
60      /**
61       * @param os
62       */
63      public HtmlStripper(OutputStream os) {
64          super(os);
65      }
66  
67      /**
68       * @param os
69       * @param encoding
70       * @throws UnsupportedEncodingException
71       */
72      public HtmlStripper(OutputStream os, String encoding)
73              throws UnsupportedEncodingException {
74          super(os, encoding);
75      }
76  
77      public void visit(HtmlDocument.TagBlock tagBlock) {
78          if (tagBlock.startTag.tagName.toUpperCase().equals("STYLE")) {
79          } else if (noButSpace(tagBlock.text())) {
80          } else if (!html1BlockTags.contains(tagBlock.startTag.tagName
81                  .toUpperCase())) {
82              visit(tagBlock.body);
83          } else {
84              super.visit(tagBlock);
85          }
86      }
87  
88      public void visit(HtmlDocument.Tag t) {
89          if (html1Tags.contains(t.tagName.toUpperCase())) {
90              StringBuffer s = new StringBuffer();
91              s.append("<");
92              s.append(t.tagName);
93              if (!t.tagName.toUpperCase().equals("HTML")) {
94                  for (Iterator iterator = t.attributeList.attributes.iterator(); iterator
95                          .hasNext();) {
96                      Attribute attribute = (Attribute) iterator.next();
97                      if (!attribute.name.toUpperCase().equals("STYLE")) {
98                          if (!attribute.name.toUpperCase().equals("CLASS")
99                                  && !attribute.name.toUpperCase().equals(
100                                         "MSONORMAL")) {
101                             s.append(" ");
102                             s.append(attribute.toString());
103                         }
104                     }
105                 }
106             }
107             if (t.emptyTag)
108                 s.append("/");
109             s.append(">");
110             out.print(s.toString());
111         }
112     }
113 
114     public void visit(HtmlDocument.Comment comment) {
115         // System.err.println("in comment" + comment.comment);
116     }
117 
118     boolean noButSpace(String in) {
119         String s = in.toLowerCase();
120         boolean plausible = true;
121         while (plausible)
122             if (s.startsWith("&nbsp;"))
123                 s = s.substring(6);
124             else if (s.startsWith(" "))
125                 s = s.substring(1);
126             else
127                 plausible = false;
128         return s.equals("");
129     }
130 
131     /**
132      * Runnable.
133      */
134     public static void main(String[] args) throws Exception {
135         InputStream r = new FileInputStream(args[0]);
136 
137         try {
138             HtmlDocument document = new HtmlParser(r).HtmlDocument();
139             int scrubberFlags = HtmlScrubber.DEFAULT_OPTIONS
140                     | HtmlScrubber.TRIM_SPACES | HtmlScrubber.QUOTE_ATTRS;
141 
142             document.accept(new HtmlScrubber(scrubberFlags));
143             document.accept(new HtmlCollector());
144             ByteArrayOutputStream out = new ByteArrayOutputStream();
145             document.accept(new HtmlStripper(out));
146             InputStream r2 = new ByteArrayInputStream(out.toByteArray());
147             HtmlDocument document2 = new HtmlParser(r2).HtmlDocument();
148             document2.accept(new HtmlCollector());
149             HtmlFormatter formatter = new HtmlFormatter(System.out);
150             formatter.setRightMargin(60);
151             formatter.setIndent(1);
152             document2.accept(formatter);
153 
154         } finally {
155             r.close();
156         }
157     }
158 }