1
2
3
4 package com.quiotix.html.parser;
5
6 import java.io.ByteArrayInputStream;
7 import java.io.ByteArrayOutputStream;
8 import java.io.FileInputStream;
9 import java.io.InputStream;
10 import java.io.OutputStream;
11 import java.io.UnsupportedEncodingException;
12 import java.util.HashSet;
13 import java.util.Iterator;
14 import java.util.Set;
15
16 import com.quiotix.html.parser.HtmlDocument.Attribute;
17
18
19
20
21
22
23
24 public class HtmlStripper extends HtmlDumper {
25
26 protected static Set html1BlockTags = new HashSet();
27 protected static Set html1EmptyTags = new HashSet();
28 protected static Set html1Tags = new HashSet();
29 protected static String[] html1EmptyTagStrings = { "AREA", "BASE",
30 "BASEFONT", "BR", "COL", "HR", "IMG", "INPUT", "ISINDEX", "LINK",
31 "META", "PARAM", "NEXTID", "PLAINTEXT", };
32 protected static String[] html1BlockTagStrings = { "A", "ADDRESS", "B",
33 "BLOCKQUOTE", "BODY", "CITE", "CODE", "DD", "DFN", "DIR", "DL",
34 "DT", "EM", "H1", "H2", "H3", "H4", "H5", "H6", "HEAD", "HTML",
35 "I", "KBD", "KEY", "LI", "LISTING", "MENU", "OL", "P", "PRE",
36 "SAMP", "STRONG", "TITLE", "TT", "U", "UL", "VAR", "XMP" };
37
38 protected static String[] html4BlockTagStrings = { "A", "ADDRESS", "B",
39 "BLOCKQUOTE", "BODY", "CITE", "CODE", "DD", "DFN", "DIR", "DL",
40 "DT", "EM", "H1", "H2", "H3", "H4", "H5", "H6", "HEAD", "HTML",
41 "I", "KBD", "KEY", "LI", "LISTING", "MENU", "OL", "P", "PRE",
42 "SAMP", "STRONG", "TITLE", "TABLE", "TR", "TH", "TD", "TT", "U",
43 "UL", "VAR", "XMP" };
44
45 static {
46 for (int i = 0; i < html1EmptyTagStrings.length; i++)
47 html1BlockTags.add(html1EmptyTagStrings[i]);
48 }
49 static {
50 for (int i = 0; i < html4BlockTagStrings.length; i++)
51 html1BlockTags.add(html4BlockTagStrings[i]);
52 }
53 static {
54 for (int i = 0; i < html1EmptyTagStrings.length; i++)
55 html1Tags.add(html1EmptyTagStrings[i]);
56 for (int i = 0; i < html4BlockTagStrings.length; i++)
57 html1Tags.add(html4BlockTagStrings[i]);
58 }
59
60
61
62
63 public HtmlStripper(OutputStream os) {
64 super(os);
65 }
66
67
68
69
70
71
72 public HtmlStripper(OutputStream os, String encoding)
73 throws UnsupportedEncodingException {
74 super(os, encoding);
75 }
76
77 public void visit(HtmlDocument.TagBlock tagBlock) {
78 if (tagBlock.startTag.tagName.toUpperCase().equals("STYLE")) {
79 } else if (noButSpace(tagBlock.text())) {
80 } else if (!html1BlockTags.contains(tagBlock.startTag.tagName
81 .toUpperCase())) {
82 visit(tagBlock.body);
83 } else {
84 super.visit(tagBlock);
85 }
86 }
87
88 public void visit(HtmlDocument.Tag t) {
89 if (html1Tags.contains(t.tagName.toUpperCase())) {
90 StringBuffer s = new StringBuffer();
91 s.append("<");
92 s.append(t.tagName);
93 if (!t.tagName.toUpperCase().equals("HTML")) {
94 for (Iterator iterator = t.attributeList.attributes.iterator(); iterator
95 .hasNext();) {
96 Attribute attribute = (Attribute) iterator.next();
97 if (!attribute.name.toUpperCase().equals("STYLE")) {
98 if (!attribute.name.toUpperCase().equals("CLASS")
99 && !attribute.name.toUpperCase().equals(
100 "MSONORMAL")) {
101 s.append(" ");
102 s.append(attribute.toString());
103 }
104 }
105 }
106 }
107 if (t.emptyTag)
108 s.append("/");
109 s.append(">");
110 out.print(s.toString());
111 }
112 }
113
114 public void visit(HtmlDocument.Comment comment) {
115
116 }
117
118 boolean noButSpace(String in) {
119 String s = in.toLowerCase();
120 boolean plausible = true;
121 while (plausible)
122 if (s.startsWith(" "))
123 s = s.substring(6);
124 else if (s.startsWith(" "))
125 s = s.substring(1);
126 else
127 plausible = false;
128 return s.equals("");
129 }
130
131
132
133
134 public static void main(String[] args) throws Exception {
135 InputStream r = new FileInputStream(args[0]);
136
137 try {
138 HtmlDocument document = new HtmlParser(r).HtmlDocument();
139 int scrubberFlags = HtmlScrubber.DEFAULT_OPTIONS
140 | HtmlScrubber.TRIM_SPACES | HtmlScrubber.QUOTE_ATTRS;
141
142 document.accept(new HtmlScrubber(scrubberFlags));
143 document.accept(new HtmlCollector());
144 ByteArrayOutputStream out = new ByteArrayOutputStream();
145 document.accept(new HtmlStripper(out));
146 InputStream r2 = new ByteArrayInputStream(out.toByteArray());
147 HtmlDocument document2 = new HtmlParser(r2).HtmlDocument();
148 document2.accept(new HtmlCollector());
149 HtmlFormatter formatter = new HtmlFormatter(System.out);
150 formatter.setRightMargin(60);
151 formatter.setIndent(1);
152 document2.accept(formatter);
153
154 } finally {
155 r.close();
156 }
157 }
158 }