View Javadoc

1   /*
2    * HtmlFormatter.java -- HTML document pretty-printer
3    * Copyright (C) 1999 Quiotix Corporation.  
4    *
5    * This program is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU General Public License, version 2, as 
7    * published by the Free Software Foundation.  
8    *
9    * This program is distributed in the hope that it will be useful,
10   * but WITHOUT ANY WARRANTY; without even the implied warranty of
11   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   * GNU General Public License (http://www.gnu.org/copyleft/gpl.txt)
13   * for more details.
14   */
15  
16  package com.quiotix.html.parser;
17  
18  import java.io.BufferedOutputStream;
19  import java.io.FileInputStream;
20  import java.io.InputStream;
21  import java.io.OutputStream;
22  import java.io.PrintWriter;
23  import java.util.HashSet;
24  import java.util.Iterator;
25  import java.util.Set;
26  
27  /**
28   * HtmlFormatter is a Visitor which traverses an HtmlDocument, dumping the
29   * contents of the document to a specified output stream.  It assumes that
30   * the documents has been preprocessed by HtmlCollector (which matches up
31   * beginning and end tags) and by HtmlScrubber (which formats tags in a
32   * consistent way).  In particular, HtmlScrubber should be invoked with the
33   * TRIM_SPACES option to remove trailing spaces, which can confuse the
34   * formatting algorithm.
35   * <p>
36   * The right margin and indent increment can be specified as properties.
37   * </p>
38   * 
39   * @author Brian Goetz, Quiotix
40   * @see com.quiotix.html.parser.HtmlVisitor
41   * @see com.quiotix.html.parser.HtmlCollector
42   * @see com.quiotix.html.parser.HtmlScrubber
43   */
44  
45  public class HtmlFormatter extends HtmlVisitor {
46      protected MarginWriter out;
47      protected int rightMargin = 80;
48      protected int indentSize = 2;
49      protected static Set tagsIndentBlock = new HashSet();
50      protected static Set tagsNewlineBefore = new HashSet();
51      protected static Set tagsPreformatted = new HashSet();
52      protected static Set tagsTryMatch = new HashSet();
53      protected static final String[] tagsIndentStrings
54              = {"TABLE", "TR", "TD", "TH", "FORM", "HTML", "HEAD", "BODY", "SELECT", "OL", "UL", "LI"};
55      protected static final String[] tagsNewlineBeforeStrings
56              = {"P", "H1", "H2", "H3", "H4", "H5", "H6", "BR"};
57      protected static final String[] tagsPreformattedStrings
58              = {"PRE", "SCRIPT", "STYLE"};
59      protected static final String[] tagsTryMatchStrings
60              = {"A", "TD", "TH", "TR", "I", "B", "EM", "FONT", "TT", "UL", "OL", "LI"};
61  
62      static {
63          for (int i = 0; i < tagsIndentStrings.length; i++)
64              tagsIndentBlock.add(tagsIndentStrings[i]);
65          for (int i = 0; i < tagsNewlineBeforeStrings.length; i++)
66              tagsNewlineBefore.add(tagsNewlineBeforeStrings[i]);
67          for (int i = 0; i < tagsPreformattedStrings.length; i++)
68              tagsPreformatted.add(tagsPreformattedStrings[i]);
69          for (int i = 0; i < tagsTryMatchStrings.length; i++)
70              tagsTryMatch.add(tagsTryMatchStrings[i]);
71      }
72      protected TagBlockRenderer blockRenderer = new TagBlockRenderer();
73      protected HtmlDocument.HtmlElement previousElement;
74      protected boolean inPreBlock;
75  
76      /** Constructor. */
77      public HtmlFormatter(OutputStream os) throws Exception {
78          out = new MarginWriter(new PrintWriter(new BufferedOutputStream(os)));
79          out.setRightMargin(rightMargin);
80      }
81  
82      /**
83       * @param margin the right margin column to wrap at
84       */
85      public void setRightMargin(int margin) {
86          rightMargin = margin;
87          out.setRightMargin(rightMargin);
88      }
89  
90      /**
91       * @param indent the number of spaces to indent by
92       */
93      public void setIndent(int indent) {
94          indentSize = indent;
95      }
96  
97      public void visit(HtmlDocument.TagBlock block) {
98          boolean indent;
99          boolean preformat;
100         int wasMargin = 0;
101 
102         if (tagsTryMatch.contains(block.startTag.tagName.toUpperCase())) {
103             blockRenderer.start();
104             blockRenderer.setTargetWidth(out.getRightMargin() - out.getLeftMargin());
105             blockRenderer.visit(block);
106             blockRenderer.finish();
107             if (!blockRenderer.hasBlownTarget()) {
108                 out.printAutoWrap(blockRenderer.getString());
109                 previousElement = block.endTag;
110                 return;
111             } 
112 
113         }
114 
115         // Only will get here if we've failed the try-block test
116         indent = tagsIndentBlock.contains(block.startTag.tagName.toUpperCase());
117         preformat = tagsPreformatted.contains(block.startTag.tagName.toUpperCase());
118         if (preformat) {
119             inPreBlock = true;
120             visit(block.startTag);
121             wasMargin = out.getLeftMargin();
122             out.setLeftMargin(0);
123             visit(block.body);
124             out.setLeftMargin(wasMargin);
125             visit(block.endTag);
126         } else if (indent) {
127             out.printlnSoft();
128             visit(block.startTag);
129             out.printlnSoft();
130             out.setLeftMargin(out.getLeftMargin() + indentSize);
131             visit(block.body);
132             out.setLeftMargin(out.getLeftMargin() - indentSize);
133             out.printlnSoft();
134             visit(block.endTag);
135             out.printlnSoft();
136             inPreBlock = false;
137         } else {
138             visit(block.startTag);
139             visit(block.body);
140             visit(block.endTag);
141         }
142     }
143 
144     public void visit(HtmlDocument.Tag t) {
145         String s = t.toString();
146         int hanging;
147 
148         if (tagsNewlineBefore.contains(t.tagName.toUpperCase())
149                 || out.getCurPosition() + s.length() > out.getRightMargin())
150             out.printlnSoft();
151 
152         out.print("<" + t.tagName);
153         hanging = t.tagName.length() + 1;
154         for (Iterator it = t.attributeList.attributes.iterator(); it.hasNext();) {
155             HtmlDocument.Attribute a = (HtmlDocument.Attribute) it.next();
156             out.printAutoWrap(" " + a.toString(), hanging);
157         }
158         if (t.emptyTag) out.print("/");
159         out.print(">");
160         previousElement = t;
161     }
162 
163     public void visit(HtmlDocument.EndTag t) {
164         out.printAutoWrap(t.toString());
165         if (tagsNewlineBefore.contains(t.tagName.toUpperCase())) {
166             out.printlnSoft();
167             out.println();
168         }
169         previousElement = t;
170     }
171 
172     public void visit(HtmlDocument.Comment c) {
173         out.print(c.toString());
174         previousElement = c;
175     }
176 
177     public void visit(HtmlDocument.Text t) {
178         if (inPreBlock)
179             out.print(t.text);
180         else {
181             int start = 0;
182             while (start < t.text.length()) {
183                 int index = t.text.indexOf(' ', start) + 1;
184                 if (index == 0)
185                     index = t.text.length();
186                 out.printAutoWrap(t.text.substring(start, index));
187                 start = index;
188             }
189         }
190         previousElement = t;
191     }
192 
193     public void visit(HtmlDocument.Newline n) {
194         if (inPreBlock)
195             out.println();
196         else if (previousElement instanceof HtmlDocument.Tag
197                 || previousElement instanceof HtmlDocument.EndTag
198                 || previousElement instanceof HtmlDocument.Comment
199                 || previousElement instanceof HtmlDocument.Newline)
200             out.printlnSoft();
201         else if (previousElement instanceof HtmlDocument.Text)
202             out.print(" ");
203         previousElement = n;
204     }
205 
206     public void start() {
207         previousElement = null;
208         inPreBlock = false;
209     }
210 
211     public void finish() {
212         out.flush();
213     }
214 
215     /**
216      * Runnable.
217      */
218     public static void main(String[] args) throws Exception {
219         InputStream r = new FileInputStream(args[0]);
220         HtmlDocument document;
221 
222         try {
223             document = new HtmlParser(r).HtmlDocument();
224             document.accept(new HtmlCollector());
225             document.accept(new HtmlScrubber(HtmlScrubber.DEFAULT_OPTIONS
226                     | HtmlScrubber.TRIM_SPACES));
227             document.accept(new HtmlFormatter(System.out));
228         } catch (Exception e) {
229             e.printStackTrace();
230         } finally {
231             r.close();
232         }
233     }
234 }
235 
236 
/**
 * Utility class, used by HtmlFormatter, which adds some word-wrapping
 * and hanging indent functionality to a PrintWriter.
 * <p>
 * The writer tracks the current output column.  The left margin is emitted
 * lazily, as spaces, when the first text is printed on a fresh line.
 * </p>
 */

class MarginWriter {
    protected int tabStop;
    /** Column of the next character to be written; 0 means a fresh line. */
    protected int curPosition;
    protected int leftMargin;
    protected int rightMargin;
    protected java.io.PrintWriter out;
    /** Block of blanks used as the source for indentation. */
    protected char[] spaces = new char[256];

    /**
     * Creates a margin writer on top of the given PrintWriter.
     *
     * @param out the writer that receives all output
     */
    MarginWriter(java.io.PrintWriter out) {
        this.out = out;
        for (int i = 0; i < spaces.length; i++) {
            spaces[i] = ' ';
        }
    }

    /** Flushes the underlying writer. */
    void flush() {
        out.flush();
    }

    /** Closes the underlying writer. */
    void close() {
        out.close();
    }

    /**
     * Prints a string without any wrapping.  If this is the first output on
     * the line, the left margin is written first.
     *
     * @param s the text to print
     */
    void print(String s) {
        if (curPosition == 0 && leftMargin > 0) {
            writeSpaces(leftMargin);
            curPosition = leftMargin;
        }
        out.print(s);
        curPosition += s.length();
    }

    /**
     * Prints a string, starting a new line first when the string would run
     * past the right margin and the current line already has content beyond
     * the left margin.
     *
     * @param s the text to print
     */
    void printAutoWrap(String s) {
        if (curPosition > leftMargin
                && curPosition + s.length() > rightMargin) {
            println();
        }
        print(s);
    }

    /**
     * Like {@link #printAutoWrap(String)}, but when a new line is started it
     * is indented by {@code hanging} columns in addition to the left margin.
     *
     * @param s       the text to print
     * @param hanging extra indentation applied to continuation lines
     */
    void printAutoWrap(String s, int hanging) {
        if (curPosition > leftMargin
                && curPosition + s.length() > rightMargin) {
            println();
            writeSpaces(hanging + leftMargin);
            curPosition = leftMargin + hanging;
        }
        print(s);
    }

    /** Ends the current line unconditionally. */
    void println() {
        curPosition = 0;
        out.println();
    }

    /** Ends the current line only if something has been written on it. */
    void printlnSoft() {
        if (curPosition > 0) {
            println();
        }
    }

    void setLeftMargin(int leftMargin) {
        this.leftMargin = leftMargin;
    }

    int getLeftMargin() {
        return leftMargin;
    }

    void setRightMargin(int rightMargin) {
        this.rightMargin = rightMargin;
    }

    int getRightMargin() {
        return rightMargin;
    }

    /**
     * @return the column at which the next character will appear; on a fresh
     *         line this is the left margin, because the margin is written
     *         before the first text
     */
    int getCurPosition() {
        return (curPosition == 0 ? leftMargin : curPosition);
    }

    /**
     * Writes {@code count} blanks.  The blanks are copied from the fixed-size
     * {@link #spaces} buffer in chunks, so indentation wider than the buffer
     * no longer overruns it.  A non-positive count writes nothing.
     */
    private void writeSpaces(int count) {
        int remaining = count;
        while (remaining > 0) {
            int chunk = Math.min(remaining, spaces.length);
            out.write(spaces, 0, chunk);
            remaining -= chunk;
        }
    }
}
321 
322 /**
323  * Utility class, used by HtmlFormatter, which tentatively tries to format
324  * the contents of an HtmlDocument.TagBlock to see if the entire block can
325  * fit on the rest of the line.  If it cannot, it gives up and indicates
326  * failure through the hasBlownTarget method; if it can, the contents can
327  * be retrieved through the getString method.
328  */
329 
330 class TagBlockRenderer extends HtmlVisitor {
331     protected String s;
332     protected boolean multiLine;
333     protected boolean blownTarget;
334     protected int targetWidth = 80;
335 
336     public void start() {
337         s = "";
338         multiLine = false;
339         blownTarget = false;
340     }
341 
342     public void finish() {
343     }
344 
345     void setTargetWidth(int w) {
346         targetWidth = w;
347     }
348 
349     String getString() {
350         return s;
351     }
352 
353     boolean isMultiLine() {
354         return multiLine;
355     }
356 
357     boolean hasBlownTarget() {
358         return blownTarget;
359     }
360 
361     public void visit(HtmlDocument.Tag t) {
362         if (s.length() < targetWidth)
363             s += t.toString();
364         else
365             blownTarget = true;
366     }
367 
368     public void visit(HtmlDocument.EndTag t) {
369         if (s.length() < targetWidth)
370             s += t.toString();
371         else
372             blownTarget = true;
373     }
374 
375     public void visit(HtmlDocument.Comment c) {
376         if (s.length() < targetWidth)
377             s += c.toString();
378         else
379             blownTarget = true;
380     }
381 
382     public void visit(HtmlDocument.Text t) {
383         if (s.length() < targetWidth)
384             s += t.toString();
385         else
386             blownTarget = true;
387     }
388 
389     public void visit(HtmlDocument.Newline n) {
390         multiLine = true;
391         s += " ";
392     }
393 }
394 
395