Coverage Report

Coverage Report - com.quiotix.html.parser.HtmlStripper

Classes in this File

Line Coverage

Branch Coverage

Complexity

HtmlStripper

0/70

0/26

2.857

 /**
  * 
  */
 package com.quiotix.html.parser;
 
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.FileInputStream;
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.io.UnsupportedEncodingException;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Locale;
 import java.util.Set;
 
 import com.quiotix.html.parser.HtmlDocument.Attribute;
 
 /**
  * A runnable class intended to produce readable, sparse html from formatted
  * pages.
  * 
  * @author timp
  */
 public class HtmlStripper extends HtmlDumper {
 
     protected static Set html1BlockTags = new HashSet();
     protected static Set html1EmptyTags = new HashSet();
     protected static Set html1Tags = new HashSet();
     protected static String[] html1EmptyTagStrings = { "AREA", "BASE",
             "BASEFONT", "BR", "COL", "HR", "IMG", "INPUT", "ISINDEX", "LINK",
             "META", "PARAM", "NEXTID", "PLAINTEXT", };
     protected static String[] html1BlockTagStrings = { "A", "ADDRESS", "B",
             "BLOCKQUOTE", "BODY", "CITE", "CODE", "DD", "DFN", "DIR", "DL",
             "DT", "EM", "H1", "H2", "H3", "H4", "H5", "H6", "HEAD", "HTML",
             "I", "KBD", "KEY", "LI", "LISTING", "MENU", "OL", "P", "PRE",
             "SAMP", "STRONG", "TITLE", "TT", "U", "UL", "VAR", "XMP" };
 
     protected static String[] html4BlockTagStrings = { "A", "ADDRESS", "B",
             "BLOCKQUOTE", "BODY", "CITE", "CODE", "DD", "DFN", "DIR", "DL",
             "DT", "EM", "H1", "H2", "H3", "H4", "H5", "H6", "HEAD", "HTML",
             "I", "KBD", "KEY", "LI", "LISTING", "MENU", "OL", "P", "PRE",
             "SAMP", "STRONG", "TITLE", "TABLE", "TR", "TH", "TD", "TT", "U",
             "UL", "VAR", "XMP" };
 
     static {
         for (int i = 0; i < html1EmptyTagStrings.length; i++)
             html1BlockTags.add(html1EmptyTagStrings[i]);
     }
     static {
         for (int i = 0; i < html4BlockTagStrings.length; i++)
             html1BlockTags.add(html4BlockTagStrings[i]);
     }
     static {
         for (int i = 0; i < html1EmptyTagStrings.length; i++)
             html1Tags.add(html1EmptyTagStrings[i]);
         for (int i = 0; i < html4BlockTagStrings.length; i++)
             html1Tags.add(html4BlockTagStrings[i]);
     }
 
     /**
      * @param os
      */
     public HtmlStripper(OutputStream os) {
         super(os);
     }
 
     /**
      * @param os
      * @param encoding
      * @throws UnsupportedEncodingException
      */
     public HtmlStripper(OutputStream os, String encoding)
             throws UnsupportedEncodingException {
         super(os, encoding);
     }
 
     public void visit(HtmlDocument.TagBlock tagBlock) {
         if (tagBlock.startTag.tagName.toUpperCase().equals("STYLE")) {
         } else if (noButSpace(tagBlock.text())) {
         } else if (!html1BlockTags.contains(tagBlock.startTag.tagName
                 .toUpperCase())) {
             visit(tagBlock.body);
         } else {
             super.visit(tagBlock);
         }
     }
 
     public void visit(HtmlDocument.Tag t) {
         if (html1Tags.contains(t.tagName.toUpperCase())) {
             StringBuffer s = new StringBuffer();
             s.append("<");
             s.append(t.tagName);
             if (!t.tagName.toUpperCase().equals("HTML")) {
                 for (Iterator iterator = t.attributeList.attributes.iterator(); iterator
                         .hasNext();) {
                     Attribute attribute = (Attribute) iterator.next();
                     if (!attribute.name.toUpperCase().equals("STYLE")) {
                         if (!attribute.name.toUpperCase().equals("CLASS")
                                 && !attribute.name.toUpperCase().equals(
                                         "MSONORMAL")) {
                             s.append(" ");
                             s.append(attribute.toString());
                         }
                     }
                 }
             }
             if (t.emptyTag)
                 s.append("/");
             s.append(">");
             out.print(s.toString());
         }
     }
 
     public void visit(HtmlDocument.Comment comment) {
         // System.err.println("in comment" + comment.comment);
     }
 
     boolean noButSpace(String in) {
         String s = in.toLowerCase();
         boolean plausible = true;
         while (plausible)
             if (s.startsWith("&nbsp;"))
                 s = s.substring(6);
             else if (s.startsWith(" "))
                 s = s.substring(1);
             else
                 plausible = false;
         return s.equals("");
     }
 
     /**
      * Runnable.
      */
     public static void main(String[] args) throws Exception {
         InputStream r = new FileInputStream(args[0]);
 
         try {
             HtmlDocument document = new HtmlParser(r).HtmlDocument();
             int scrubberFlags = HtmlScrubber.DEFAULT_OPTIONS
                     | HtmlScrubber.TRIM_SPACES | HtmlScrubber.QUOTE_ATTRS;
 
             document.accept(new HtmlScrubber(scrubberFlags));
             document.accept(new HtmlCollector());
             ByteArrayOutputStream out = new ByteArrayOutputStream();
             document.accept(new HtmlStripper(out));
             InputStream r2 = new ByteArrayInputStream(out.toByteArray());
             HtmlDocument document2 = new HtmlParser(r2).HtmlDocument();
             document2.accept(new HtmlCollector());
             HtmlFormatter formatter = new HtmlFormatter(System.out);
             formatter.setRightMargin(60);
             formatter.setIndent(1);
             document2.accept(formatter);
 
         } finally {
             r.close();
         }
     }
 }

1		/**
2		*
3		*/
4		package com.quiotix.html.parser;
5
6		import java.io.ByteArrayInputStream;
7		import java.io.ByteArrayOutputStream;
8		import java.io.FileInputStream;
9		import java.io.InputStream;
10		import java.io.OutputStream;
11		import java.io.UnsupportedEncodingException;
12		import java.util.HashSet;
13		import java.util.Iterator;
14		import java.util.Set;
15
16		import com.quiotix.html.parser.HtmlDocument.Attribute;
17
18		/**
19		* A runnable class intended to produce readable, sparse html from formatted
20		* pages.
21		*
22		* @author timp
23		*/
24		public class HtmlStripper extends HtmlDumper {
25
26	0	protected static Set html1BlockTags = new HashSet();
27	0	protected static Set html1EmptyTags = new HashSet();
28	0	protected static Set html1Tags = new HashSet();
29	0	protected static String[] html1EmptyTagStrings = { "AREA", "BASE",
30		"BASEFONT", "BR", "COL", "HR", "IMG", "INPUT", "ISINDEX", "LINK",
31		"META", "PARAM", "NEXTID", "PLAINTEXT", };
32	0	protected static String[] html1BlockTagStrings = { "A", "ADDRESS", "B",
33		"BLOCKQUOTE", "BODY", "CITE", "CODE", "DD", "DFN", "DIR", "DL",
34		"DT", "EM", "H1", "H2", "H3", "H4", "H5", "H6", "HEAD", "HTML",
35		"I", "KBD", "KEY", "LI", "LISTING", "MENU", "OL", "P", "PRE",
36		"SAMP", "STRONG", "TITLE", "TT", "U", "UL", "VAR", "XMP" };
37
38	0	protected static String[] html4BlockTagStrings = { "A", "ADDRESS", "B",
39		"BLOCKQUOTE", "BODY", "CITE", "CODE", "DD", "DFN", "DIR", "DL",
40		"DT", "EM", "H1", "H2", "H3", "H4", "H5", "H6", "HEAD", "HTML",
41		"I", "KBD", "KEY", "LI", "LISTING", "MENU", "OL", "P", "PRE",
42		"SAMP", "STRONG", "TITLE", "TABLE", "TR", "TH", "TD", "TT", "U",
43		"UL", "VAR", "XMP" };
44
45		static {
46	0	for (int i = 0; i < html1EmptyTagStrings.length; i++)
47	0	html1BlockTags.add(html1EmptyTagStrings[i]);
48		}
49		static {
50	0	for (int i = 0; i < html4BlockTagStrings.length; i++)
51	0	html1BlockTags.add(html4BlockTagStrings[i]);
52		}
53		static {
54	0	for (int i = 0; i < html1EmptyTagStrings.length; i++)
55	0	html1Tags.add(html1EmptyTagStrings[i]);
56	0	for (int i = 0; i < html4BlockTagStrings.length; i++)
57	0	html1Tags.add(html4BlockTagStrings[i]);
58	0	}
59
60		/**
61		* @param os
62		*/
63		public HtmlStripper(OutputStream os) {
64	0	super(os);
65	0	}
66
67		/**
68		* @param os
69		* @param encoding
70		* @throws UnsupportedEncodingException
71		*/
72		public HtmlStripper(OutputStream os, String encoding)
73		throws UnsupportedEncodingException {
74	0	super(os, encoding);
75	0	}
76
77		public void visit(HtmlDocument.TagBlock tagBlock) {
78	0	if (tagBlock.startTag.tagName.toUpperCase().equals("STYLE")) {
79	0	} else if (noButSpace(tagBlock.text())) {
80	0	} else if (!html1BlockTags.contains(tagBlock.startTag.tagName
81		.toUpperCase())) {
82	0	visit(tagBlock.body);
83		} else {
84	0	super.visit(tagBlock);
85		}
86	0	}
87
88		public void visit(HtmlDocument.Tag t) {
89	0	if (html1Tags.contains(t.tagName.toUpperCase())) {
90	0	StringBuffer s = new StringBuffer();
91	0	s.append("<");
92	0	s.append(t.tagName);
93	0	if (!t.tagName.toUpperCase().equals("HTML")) {
94	0	for (Iterator iterator = t.attributeList.attributes.iterator(); iterator
95	0	.hasNext();) {
96	0	Attribute attribute = (Attribute) iterator.next();
97	0	if (!attribute.name.toUpperCase().equals("STYLE")) {
98	0	if (!attribute.name.toUpperCase().equals("CLASS")
99		&& !attribute.name.toUpperCase().equals(
100		"MSONORMAL")) {
101	0	s.append(" ");
102	0	s.append(attribute.toString());
103		}
104		}
105	0	}
106		}
107	0	if (t.emptyTag)
108	0	s.append("/");
109	0	s.append(">");
110	0	out.print(s.toString());
111		}
112	0	}
113
114		public void visit(HtmlDocument.Comment comment) {
115		// System.err.println("in comment" + comment.comment);
116	0	}
117
118		boolean noButSpace(String in) {
119	0	String s = in.toLowerCase();
120	0	boolean plausible = true;
121	0	while (plausible)
122	0	if (s.startsWith("&nbsp;"))
123	0	s = s.substring(6);
124	0	else if (s.startsWith(" "))
125	0	s = s.substring(1);
126		else
127	0	plausible = false;
128	0	return s.equals("");
129		}
130
131		/**
132		* Runnable.
133		*/
134		public static void main(String[] args) throws Exception {
135	0	InputStream r = new FileInputStream(args[0]);
136
137		try {
138	0	HtmlDocument document = new HtmlParser(r).HtmlDocument();
139	0	int scrubberFlags = HtmlScrubber.DEFAULT_OPTIONS
140		| HtmlScrubber.TRIM_SPACES | HtmlScrubber.QUOTE_ATTRS;
141
142	0	document.accept(new HtmlScrubber(scrubberFlags));
143	0	document.accept(new HtmlCollector());
144	0	ByteArrayOutputStream out = new ByteArrayOutputStream();
145	0	document.accept(new HtmlStripper(out));
146	0	InputStream r2 = new ByteArrayInputStream(out.toByteArray());
147	0	HtmlDocument document2 = new HtmlParser(r2).HtmlDocument();
148	0	document2.accept(new HtmlCollector());
149	0	HtmlFormatter formatter = new HtmlFormatter(System.out);
150	0	formatter.setRightMargin(60);
151	0	formatter.setIndent(1);
152	0	document2.accept(formatter);
153
154		} finally {
155	0	r.close();
156	0	}
157	0	}
158		}