Coverage Report - com.quiotix.html.parser.HtmlFormatter
 
Classes in this File Line Coverage Branch Coverage Complexity
HtmlFormatter
47%
58/121
44%
16/36
1.917
MarginWriter
71%
28/39
66%
12/18
1.917
TagBlockRenderer
6%
2/31
0%
0/8
1.917
 
 1  
 /*
 2  
  * HtmlFormatter.java -- HTML document pretty-printer
 3  
  * Copyright (C) 1999 Quiotix Corporation.  
 4  
  *
 5  
  * This program is free software; you can redistribute it and/or modify
 6  
  * it under the terms of the GNU General Public License, version 2, as 
 7  
  * published by the Free Software Foundation.  
 8  
  *
 9  
  * This program is distributed in the hope that it will be useful,
 10  
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 11  
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12  
  * GNU General Public License (http://www.gnu.org/copyleft/gpl.txt)
 13  
  * for more details.
 14  
  */
 15  
 
 16  
 package com.quiotix.html.parser;
 17  
 
 18  
 import java.io.BufferedOutputStream;
 19  
 import java.io.FileInputStream;
 20  
 import java.io.InputStream;
 21  
 import java.io.OutputStream;
 22  
 import java.io.PrintWriter;
 23  
 import java.util.HashSet;
 24  
 import java.util.Iterator;
 25  
 import java.util.Set;
 26  
 
 27  
 /**
 28  
  * HtmlFormatter is a Visitor which traverses an HtmlDocument, dumping the
 29  
  * contents of the document to a specified output stream.  It assumes that
 30  
  * the document has been preprocessed by HtmlCollector (which matches up
 31  
  * beginning and end tags) and by HtmlScrubber (which formats tags in a
 32  
  * consistent way).  In particular, HtmlScrubber should be invoked with the
 33  
  * TRIM_SPACES option to remove trailing spaces, which can confuse the
 34  
  * formatting algorithm.
 35  
  * <p>
 36  
  * The right margin and indent increment can be specified as properties.
 37  
  * </p>
 38  
  * 
 39  
  * @author Brian Goetz, Quiotix
 40  
  * @see com.quiotix.html.parser.HtmlVisitor
 41  
  * @see com.quiotix.html.parser.HtmlCollector
 42  
  * @see com.quiotix.html.parser.HtmlScrubber
 43  
  */
 44  
 
 45  
 public class HtmlFormatter extends HtmlVisitor {
 46  
     protected MarginWriter out;
 47  4
     protected int rightMargin = 80;
 48  4
     protected int indentSize = 2;
 49  2
     protected static Set tagsIndentBlock = new HashSet();
 50  2
     protected static Set tagsNewlineBefore = new HashSet();
 51  2
     protected static Set tagsPreformatted = new HashSet();
 52  2
     protected static Set tagsTryMatch = new HashSet();
 53  2
     protected static final String[] tagsIndentStrings
 54  
             = {"TABLE", "TR", "TD", "TH", "FORM", "HTML", "HEAD", "BODY", "SELECT", "OL", "UL", "LI"};
 55  2
     protected static final String[] tagsNewlineBeforeStrings
 56  
             = {"P", "H1", "H2", "H3", "H4", "H5", "H6", "BR"};
 57  2
     protected static final String[] tagsPreformattedStrings
 58  
             = {"PRE", "SCRIPT", "STYLE"};
 59  2
     protected static final String[] tagsTryMatchStrings
 60  
             = {"A", "TD", "TH", "TR", "I", "B", "EM", "FONT", "TT", "UL", "OL", "LI"};
 61  
 
 62  
     static {
 63  26
         for (int i = 0; i < tagsIndentStrings.length; i++)
 64  24
             tagsIndentBlock.add(tagsIndentStrings[i]);
 65  18
         for (int i = 0; i < tagsNewlineBeforeStrings.length; i++)
 66  16
             tagsNewlineBefore.add(tagsNewlineBeforeStrings[i]);
 67  8
         for (int i = 0; i < tagsPreformattedStrings.length; i++)
 68  6
             tagsPreformatted.add(tagsPreformattedStrings[i]);
 69  26
         for (int i = 0; i < tagsTryMatchStrings.length; i++)
 70  24
             tagsTryMatch.add(tagsTryMatchStrings[i]);
 71  2
     }
 72  4
     protected TagBlockRenderer blockRenderer = new TagBlockRenderer();
 73  
     protected HtmlDocument.HtmlElement previousElement;
 74  
     protected boolean inPreBlock;
 75  
 
 76  
     /** Constructor. */
 77  4
     public HtmlFormatter(OutputStream os) throws Exception {
 78  4
         out = new MarginWriter(new PrintWriter(new BufferedOutputStream(os)));
 79  4
         out.setRightMargin(rightMargin);
 80  4
     }
 81  
 
 82  
     /**
 83  
      * @param margin the right margin column to wrap at
 84  
      */
 85  
     public void setRightMargin(int margin) {
 86  0
         rightMargin = margin;
 87  0
         out.setRightMargin(rightMargin);
 88  0
     }
 89  
 
 90  
     /**
 91  
      * @param indent the number of spaces to indent by
 92  
      */
 93  
     public void setIndent(int indent) {
 94  0
         indentSize = indent;
 95  0
     }
 96  
 
 97  
     public void visit(HtmlDocument.TagBlock block) {
 98  
         boolean indent;
 99  
         boolean preformat;
 100  0
         int wasMargin = 0;
 101  
 
 102  0
         if (tagsTryMatch.contains(block.startTag.tagName.toUpperCase())) {
 103  0
             blockRenderer.start();
 104  0
             blockRenderer.setTargetWidth(out.getRightMargin() - out.getLeftMargin());
 105  0
             blockRenderer.visit(block);
 106  0
             blockRenderer.finish();
 107  0
             if (!blockRenderer.hasBlownTarget()) {
 108  0
                 out.printAutoWrap(blockRenderer.getString());
 109  0
                 previousElement = block.endTag;
 110  0
                 return;
 111  
             } 
 112  
 
 113  
         }
 114  
 
 115  
         // Only will get here if we've failed the try-block test
 116  0
         indent = tagsIndentBlock.contains(block.startTag.tagName.toUpperCase());
 117  0
         preformat = tagsPreformatted.contains(block.startTag.tagName.toUpperCase());
 118  0
         if (preformat) {
 119  0
             inPreBlock = true;
 120  0
             visit(block.startTag);
 121  0
             wasMargin = out.getLeftMargin();
 122  0
             out.setLeftMargin(0);
 123  0
             visit(block.body);
 124  0
             out.setLeftMargin(wasMargin);
 125  0
             visit(block.endTag);
 126  0
         } else if (indent) {
 127  0
             out.printlnSoft();
 128  0
             visit(block.startTag);
 129  0
             out.printlnSoft();
 130  0
             out.setLeftMargin(out.getLeftMargin() + indentSize);
 131  0
             visit(block.body);
 132  0
             out.setLeftMargin(out.getLeftMargin() - indentSize);
 133  0
             out.printlnSoft();
 134  0
             visit(block.endTag);
 135  0
             out.printlnSoft();
 136  0
             inPreBlock = false;
 137  
         } else {
 138  0
             visit(block.startTag);
 139  0
             visit(block.body);
 140  0
             visit(block.endTag);
 141  
         }
 142  0
     }
 143  
 
 144  
     public void visit(HtmlDocument.Tag t) {
 145  20
         String s = t.toString();
 146  
         int hanging;
 147  
 
 148  20
         if (tagsNewlineBefore.contains(t.tagName.toUpperCase())
 149  
                 || out.getCurPosition() + s.length() > out.getRightMargin())
 150  8
             out.printlnSoft();
 151  
 
 152  20
         out.print("<" + t.tagName);
 153  20
         hanging = t.tagName.length() + 1;
 154  20
         for (Iterator it = t.attributeList.attributes.iterator(); it.hasNext();) {
 155  4
             HtmlDocument.Attribute a = (HtmlDocument.Attribute) it.next();
 156  4
             out.printAutoWrap(" " + a.toString(), hanging);
 157  4
         }
 158  20
         if (t.emptyTag) out.print("/");
 159  20
         out.print(">");
 160  20
         previousElement = t;
 161  20
     }
 162  
 
 163  
     public void visit(HtmlDocument.EndTag t) {
 164  0
         out.printAutoWrap(t.toString());
 165  0
         if (tagsNewlineBefore.contains(t.tagName.toUpperCase())) {
 166  0
             out.printlnSoft();
 167  0
             out.println();
 168  
         }
 169  0
         previousElement = t;
 170  0
     }
 171  
 
 172  
     public void visit(HtmlDocument.Comment c) {
 173  0
         out.print(c.toString());
 174  0
         previousElement = c;
 175  0
     }
 176  
 
 177  
     public void visit(HtmlDocument.Text t) {
 178  8
         if (inPreBlock)
 179  0
             out.print(t.text);
 180  
         else {
 181  8
             int start = 0;
 182  46
             while (start < t.text.length()) {
 183  38
                 int index = t.text.indexOf(' ', start) + 1;
 184  38
                 if (index == 0)
 185  0
                     index = t.text.length();
 186  38
                 out.printAutoWrap(t.text.substring(start, index));
 187  38
                 start = index;
 188  38
             }
 189  
         }
 190  8
         previousElement = t;
 191  8
     }
 192  
 
 193  
     public void visit(HtmlDocument.Newline n) {
 194  4
         if (inPreBlock)
 195  0
             out.println();
 196  4
         else if (previousElement instanceof HtmlDocument.Tag
 197  
                 || previousElement instanceof HtmlDocument.EndTag
 198  
                 || previousElement instanceof HtmlDocument.Comment
 199  
                 || previousElement instanceof HtmlDocument.Newline)
 200  0
             out.printlnSoft();
 201  4
         else if (previousElement instanceof HtmlDocument.Text)
 202  4
             out.print(" ");
 203  4
         previousElement = n;
 204  4
     }
 205  
 
 206  
     public void start() {
 207  4
         previousElement = null;
 208  4
         inPreBlock = false;
 209  4
     }
 210  
 
 211  
     public void finish() {
 212  4
         out.flush();
 213  4
     }
 214  
 
 215  
     /**
 216  
      * Runnable.
 217  
      */
 218  
     public static void main(String[] args) throws Exception {
 219  0
         InputStream r = new FileInputStream(args[0]);
 220  
         HtmlDocument document;
 221  
 
 222  
         try {
 223  0
             document = new HtmlParser(r).HtmlDocument();
 224  0
             document.accept(new HtmlCollector());
 225  0
             document.accept(new HtmlScrubber(HtmlScrubber.DEFAULT_OPTIONS
 226  
                     | HtmlScrubber.TRIM_SPACES));
 227  0
             document.accept(new HtmlFormatter(System.out));
 228  0
         } catch (Exception e) {
 229  0
             e.printStackTrace();
 230  
         } finally {
 231  0
             r.close();
 232  0
         }
 233  0
     }
 234  
 }
 235  
 
 236  
 
 237  
 /**
 238  
  * Utility class, used by HtmlFormatter, which adds some word-wrapping
 239  
  * and hanging indent functionality to a PrintWriter.
 240  
  */
 241  
 
 242  
 class MarginWriter {
 243  
     protected int tabStop;
 244  
     protected int curPosition;
 245  
     protected int leftMargin;
 246  
     protected int rightMargin;
 247  
     protected java.io.PrintWriter out;
 248  4
     protected char[] spaces = new char[256];
 249  
 
 250  
     /** Constructor. */
 251  4
     MarginWriter(java.io.PrintWriter out) {
 252  4
         this.out = out;
 253  1028
         for (int i = 0; i < spaces.length; i++)
 254  1024
             spaces[i] = ' ';
 255  4
     }
 256  
 
 257  
     void flush() {
 258  4
         out.flush();
 259  4
     }
 260  
 
 261  
     void close() {
 262  0
         out.close();
 263  0
     }
 264  
 
 265  
     void print(String s) {
 266  86
         if (curPosition == 0 && leftMargin > 0) {
 267  0
             out.write(spaces, 0, leftMargin);
 268  0
             curPosition = leftMargin;
 269  
         }
 270  86
         out.print(s);
 271  86
         curPosition += s.length();
 272  86
     }
 273  
 
 274  
     void printAutoWrap(String s) {
 275  38
         if (curPosition > leftMargin
 276  
                 && curPosition + s.length() > rightMargin)
 277  0
             println();
 278  38
         print(s);
 279  38
     }
 280  
 
 281  
     void printAutoWrap(String s, int hanging) {
 282  4
         if (curPosition > leftMargin
 283  
                 && curPosition + s.length() > rightMargin) {
 284  0
             println();
 285  0
             out.write(spaces, 0, hanging + leftMargin);
 286  0
             curPosition = leftMargin + hanging;
 287  
         };
 288  4
         print(s);
 289  4
     }
 290  
 
 291  
     void println() {
 292  8
         curPosition = 0;
 293  8
         out.println();
 294  8
     }
 295  
 
 296  
     void printlnSoft() {
 297  8
         if (curPosition > 0)
 298  8
             println();
 299  8
     }
 300  
 
 301  
     void setLeftMargin(int leftMargin) {
 302  0
         this.leftMargin = leftMargin;
 303  0
     }
 304  
 
 305  
     int getLeftMargin() {
 306  0
         return leftMargin;
 307  
     }
 308  
 
 309  
     void setRightMargin(int rightMargin) {
 310  4
         this.rightMargin = rightMargin;
 311  4
     }
 312  
 
 313  
     int getRightMargin() {
 314  12
         return rightMargin;
 315  
     }
 316  
 
 317  
     int getCurPosition() {
 318  12
         return (curPosition == 0 ? leftMargin : curPosition);
 319  
     }
 320  
 }
 321  
 
 322  
 /**
 323  
  * Utility class, used by HtmlFormatter, which tentatively tries to format
 324  
  * the contents of an HtmlDocument.TagBlock to see if the entire block can
 325  
  * fit on the rest of the line.  If it cannot, it gives up and indicates
 326  
  * failure through the hasBlownTarget method; if it can, the contents can
 327  
  * be retrieved through the getString method.
 328  
  */
 329  
 
 330  4
 class TagBlockRenderer extends HtmlVisitor {
 331  
     protected String s;
 332  
     protected boolean multiLine;
 333  
     protected boolean blownTarget;
 334  4
     protected int targetWidth = 80;
 335  
 
 336  
     public void start() {
 337  0
         s = "";
 338  0
         multiLine = false;
 339  0
         blownTarget = false;
 340  0
     }
 341  
 
 342  
     public void finish() {
 343  0
     }
 344  
 
 345  
     void setTargetWidth(int w) {
 346  0
         targetWidth = w;
 347  0
     }
 348  
 
 349  
     String getString() {
 350  0
         return s;
 351  
     }
 352  
 
 353  
     boolean isMultiLine() {
 354  0
         return multiLine;
 355  
     }
 356  
 
 357  
     boolean hasBlownTarget() {
 358  0
         return blownTarget;
 359  
     }
 360  
 
 361  
     public void visit(HtmlDocument.Tag t) {
 362  0
         if (s.length() < targetWidth)
 363  0
             s += t.toString();
 364  
         else
 365  0
             blownTarget = true;
 366  0
     }
 367  
 
 368  
     public void visit(HtmlDocument.EndTag t) {
 369  0
         if (s.length() < targetWidth)
 370  0
             s += t.toString();
 371  
         else
 372  0
             blownTarget = true;
 373  0
     }
 374  
 
 375  
     public void visit(HtmlDocument.Comment c) {
 376  0
         if (s.length() < targetWidth)
 377  0
             s += c.toString();
 378  
         else
 379  0
             blownTarget = true;
 380  0
     }
 381  
 
 382  
     public void visit(HtmlDocument.Text t) {
 383  0
         if (s.length() < targetWidth)
 384  0
             s += t.toString();
 385  
         else
 386  0
             blownTarget = true;
 387  0
     }
 388  
 
 389  
     public void visit(HtmlDocument.Newline n) {
 390  0
         multiLine = true;
 391  0
         s += " ";
 392  0
     }
 393  
 }
 394  
 
 395