1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 package com.quiotix.html.parser;
17
18 import java.io.BufferedOutputStream;
19 import java.io.FileInputStream;
20 import java.io.InputStream;
21 import java.io.OutputStream;
22 import java.io.PrintWriter;
23 import java.util.HashSet;
24 import java.util.Iterator;
25 import java.util.Set;
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45 public class HtmlFormatter extends HtmlVisitor {
46 protected MarginWriter out;
47 protected int rightMargin = 80;
48 protected int indentSize = 2;
49 protected static Set tagsIndentBlock = new HashSet();
50 protected static Set tagsNewlineBefore = new HashSet();
51 protected static Set tagsPreformatted = new HashSet();
52 protected static Set tagsTryMatch = new HashSet();
53 protected static final String[] tagsIndentStrings
54 = {"TABLE", "TR", "TD", "TH", "FORM", "HTML", "HEAD", "BODY", "SELECT", "OL", "UL", "LI"};
55 protected static final String[] tagsNewlineBeforeStrings
56 = {"P", "H1", "H2", "H3", "H4", "H5", "H6", "BR"};
57 protected static final String[] tagsPreformattedStrings
58 = {"PRE", "SCRIPT", "STYLE"};
59 protected static final String[] tagsTryMatchStrings
60 = {"A", "TD", "TH", "TR", "I", "B", "EM", "FONT", "TT", "UL", "OL", "LI"};
61
62 static {
63 for (int i = 0; i < tagsIndentStrings.length; i++)
64 tagsIndentBlock.add(tagsIndentStrings[i]);
65 for (int i = 0; i < tagsNewlineBeforeStrings.length; i++)
66 tagsNewlineBefore.add(tagsNewlineBeforeStrings[i]);
67 for (int i = 0; i < tagsPreformattedStrings.length; i++)
68 tagsPreformatted.add(tagsPreformattedStrings[i]);
69 for (int i = 0; i < tagsTryMatchStrings.length; i++)
70 tagsTryMatch.add(tagsTryMatchStrings[i]);
71 }
72 protected TagBlockRenderer blockRenderer = new TagBlockRenderer();
73 protected HtmlDocument.HtmlElement previousElement;
74 protected boolean inPreBlock;
75
76
77 public HtmlFormatter(OutputStream os) throws Exception {
78 out = new MarginWriter(new PrintWriter(new BufferedOutputStream(os)));
79 out.setRightMargin(rightMargin);
80 }
81
82
83
84
85 public void setRightMargin(int margin) {
86 rightMargin = margin;
87 out.setRightMargin(rightMargin);
88 }
89
90
91
92
93 public void setIndent(int indent) {
94 indentSize = indent;
95 }
96
97 public void visit(HtmlDocument.TagBlock block) {
98 boolean indent;
99 boolean preformat;
100 int wasMargin = 0;
101
102 if (tagsTryMatch.contains(block.startTag.tagName.toUpperCase())) {
103 blockRenderer.start();
104 blockRenderer.setTargetWidth(out.getRightMargin() - out.getLeftMargin());
105 blockRenderer.visit(block);
106 blockRenderer.finish();
107 if (!blockRenderer.hasBlownTarget()) {
108 out.printAutoWrap(blockRenderer.getString());
109 previousElement = block.endTag;
110 return;
111 }
112
113 }
114
115
116 indent = tagsIndentBlock.contains(block.startTag.tagName.toUpperCase());
117 preformat = tagsPreformatted.contains(block.startTag.tagName.toUpperCase());
118 if (preformat) {
119 inPreBlock = true;
120 visit(block.startTag);
121 wasMargin = out.getLeftMargin();
122 out.setLeftMargin(0);
123 visit(block.body);
124 out.setLeftMargin(wasMargin);
125 visit(block.endTag);
126 } else if (indent) {
127 out.printlnSoft();
128 visit(block.startTag);
129 out.printlnSoft();
130 out.setLeftMargin(out.getLeftMargin() + indentSize);
131 visit(block.body);
132 out.setLeftMargin(out.getLeftMargin() - indentSize);
133 out.printlnSoft();
134 visit(block.endTag);
135 out.printlnSoft();
136 inPreBlock = false;
137 } else {
138 visit(block.startTag);
139 visit(block.body);
140 visit(block.endTag);
141 }
142 }
143
144 public void visit(HtmlDocument.Tag t) {
145 String s = t.toString();
146 int hanging;
147
148 if (tagsNewlineBefore.contains(t.tagName.toUpperCase())
149 || out.getCurPosition() + s.length() > out.getRightMargin())
150 out.printlnSoft();
151
152 out.print("<" + t.tagName);
153 hanging = t.tagName.length() + 1;
154 for (Iterator it = t.attributeList.attributes.iterator(); it.hasNext();) {
155 HtmlDocument.Attribute a = (HtmlDocument.Attribute) it.next();
156 out.printAutoWrap(" " + a.toString(), hanging);
157 }
158 if (t.emptyTag) out.print("/");
159 out.print(">");
160 previousElement = t;
161 }
162
163 public void visit(HtmlDocument.EndTag t) {
164 out.printAutoWrap(t.toString());
165 if (tagsNewlineBefore.contains(t.tagName.toUpperCase())) {
166 out.printlnSoft();
167 out.println();
168 }
169 previousElement = t;
170 }
171
172 public void visit(HtmlDocument.Comment c) {
173 out.print(c.toString());
174 previousElement = c;
175 }
176
177 public void visit(HtmlDocument.Text t) {
178 if (inPreBlock)
179 out.print(t.text);
180 else {
181 int start = 0;
182 while (start < t.text.length()) {
183 int index = t.text.indexOf(' ', start) + 1;
184 if (index == 0)
185 index = t.text.length();
186 out.printAutoWrap(t.text.substring(start, index));
187 start = index;
188 }
189 }
190 previousElement = t;
191 }
192
193 public void visit(HtmlDocument.Newline n) {
194 if (inPreBlock)
195 out.println();
196 else if (previousElement instanceof HtmlDocument.Tag
197 || previousElement instanceof HtmlDocument.EndTag
198 || previousElement instanceof HtmlDocument.Comment
199 || previousElement instanceof HtmlDocument.Newline)
200 out.printlnSoft();
201 else if (previousElement instanceof HtmlDocument.Text)
202 out.print(" ");
203 previousElement = n;
204 }
205
206 public void start() {
207 previousElement = null;
208 inPreBlock = false;
209 }
210
211 public void finish() {
212 out.flush();
213 }
214
215
216
217
218 public static void main(String[] args) throws Exception {
219 InputStream r = new FileInputStream(args[0]);
220 HtmlDocument document;
221
222 try {
223 document = new HtmlParser(r).HtmlDocument();
224 document.accept(new HtmlCollector());
225 document.accept(new HtmlScrubber(HtmlScrubber.DEFAULT_OPTIONS
226 | HtmlScrubber.TRIM_SPACES));
227 document.accept(new HtmlFormatter(System.out));
228 } catch (Exception e) {
229 e.printStackTrace();
230 } finally {
231 r.close();
232 }
233 }
234 }
235
236
237
238
239
240
241
/**
 * Wraps a {@code PrintWriter} and tracks the current output column so that
 * callers can honour a left margin (emitted as leading spaces at the start
 * of each line) and a right margin (used to decide when to wrap).
 */
class MarginWriter {
    protected int tabStop;
    /** Column of the next character on the current line; 0 at start of line. */
    protected int curPosition;
    protected int leftMargin;
    protected int rightMargin;
    protected java.io.PrintWriter out;
    /** Reusable buffer of blanks from which margin padding is written. */
    protected char[] spaces = new char[256];

    /**
     * Creates a margin writer on top of the given writer.
     *
     * @param out the writer that receives all output
     */
    MarginWriter(java.io.PrintWriter out) {
        this.out = out;
        java.util.Arrays.fill(spaces, ' ');
    }

    /** Flushes the underlying writer. */
    void flush() {
        out.flush();
    }

    /** Closes the underlying writer. */
    void close() {
        out.close();
    }

    /**
     * Writes {@code count} blanks. Padding wider than the blank buffer is
     * written in several chunks instead of indexing past the end of the
     * buffer; a non-positive count writes nothing.
     */
    private void writeSpaces(int count) {
        int remaining = count;
        while (remaining > 0) {
            int chunk = Math.min(remaining, spaces.length);
            out.write(spaces, 0, chunk);
            remaining -= chunk;
        }
    }

    /**
     * Writes a string without wrapping. At the start of a line the left
     * margin is emitted first.
     *
     * @param s the text to write
     */
    void print(String s) {
        if (curPosition == 0 && leftMargin > 0) {
            writeSpaces(leftMargin);
            curPosition = leftMargin;
        }
        out.print(s);
        curPosition += s.length();
    }

    /**
     * Writes a string, first breaking the line if the string would run past
     * the right margin and the line already holds something beyond the left
     * margin.
     *
     * @param s the text to write
     */
    void printAutoWrap(String s) {
        if (curPosition > leftMargin
                && curPosition + s.length() > rightMargin) {
            println();
        }
        print(s);
    }

    /**
     * Like {@link #printAutoWrap(String)}, but when the line is broken the
     * continuation line is indented by {@code hanging} columns beyond the
     * left margin.
     *
     * @param s       the text to write
     * @param hanging extra indentation applied to a continuation line
     */
    void printAutoWrap(String s, int hanging) {
        if (curPosition > leftMargin
                && curPosition + s.length() > rightMargin) {
            println();
            writeSpaces(hanging + leftMargin);
            curPosition = leftMargin + hanging;
        }
        print(s);
    }

    /** Ends the current line unconditionally. */
    void println() {
        curPosition = 0;
        out.println();
    }

    /** Ends the current line only if something has been written on it. */
    void printlnSoft() {
        if (curPosition > 0) {
            println();
        }
    }

    void setLeftMargin(int leftMargin) {
        this.leftMargin = leftMargin;
    }

    int getLeftMargin() {
        return leftMargin;
    }

    void setRightMargin(int rightMargin) {
        this.rightMargin = rightMargin;
    }

    int getRightMargin() {
        return rightMargin;
    }

    /**
     * Returns the column at which the next character would be written. At the
     * start of a line this is the left margin, since the margin padding has
     * not been emitted yet.
     */
    int getCurPosition() {
        return (curPosition == 0 ? leftMargin : curPosition);
    }
}
321
322
323
324
325
326
327
328
329
330 class TagBlockRenderer extends HtmlVisitor {
331 protected String s;
332 protected boolean multiLine;
333 protected boolean blownTarget;
334 protected int targetWidth = 80;
335
336 public void start() {
337 s = "";
338 multiLine = false;
339 blownTarget = false;
340 }
341
342 public void finish() {
343 }
344
345 void setTargetWidth(int w) {
346 targetWidth = w;
347 }
348
349 String getString() {
350 return s;
351 }
352
353 boolean isMultiLine() {
354 return multiLine;
355 }
356
357 boolean hasBlownTarget() {
358 return blownTarget;
359 }
360
361 public void visit(HtmlDocument.Tag t) {
362 if (s.length() < targetWidth)
363 s += t.toString();
364 else
365 blownTarget = true;
366 }
367
368 public void visit(HtmlDocument.EndTag t) {
369 if (s.length() < targetWidth)
370 s += t.toString();
371 else
372 blownTarget = true;
373 }
374
375 public void visit(HtmlDocument.Comment c) {
376 if (s.length() < targetWidth)
377 s += c.toString();
378 else
379 blownTarget = true;
380 }
381
382 public void visit(HtmlDocument.Text t) {
383 if (s.length() < targetWidth)
384 s += t.toString();
385 else
386 blownTarget = true;
387 }
388
389 public void visit(HtmlDocument.Newline n) {
390 multiLine = true;
391 s += " ";
392 }
393 }
394
395