| 1 | |
|
| 2 | |
|
| 3 | |
|
| 4 | |
package com.quiotix.html.parser; |
| 5 | |
|
| 6 | |
import java.io.ByteArrayInputStream; |
| 7 | |
import java.io.ByteArrayOutputStream; |
| 8 | |
import java.io.FileInputStream; |
| 9 | |
import java.io.InputStream; |
| 10 | |
import java.io.OutputStream; |
| 11 | |
import java.io.UnsupportedEncodingException; |
| 12 | |
import java.util.HashSet; |
| 13 | |
import java.util.Iterator; |
| 14 | |
import java.util.Set; |
| 15 | |
|
| 16 | |
import com.quiotix.html.parser.HtmlDocument.Attribute; |
| 17 | |
|
| 18 | |
|
| 19 | |
|
| 20 | |
|
| 21 | |
|
| 22 | |
|
| 23 | |
|
| 24 | |
public class HtmlStripper extends HtmlDumper { |
| 25 | |
|
| 26 | 0 | protected static Set html1BlockTags = new HashSet(); |
| 27 | 0 | protected static Set html1EmptyTags = new HashSet(); |
| 28 | 0 | protected static Set html1Tags = new HashSet(); |
| 29 | 0 | protected static String[] html1EmptyTagStrings = { "AREA", "BASE", |
| 30 | |
"BASEFONT", "BR", "COL", "HR", "IMG", "INPUT", "ISINDEX", "LINK", |
| 31 | |
"META", "PARAM", "NEXTID", "PLAINTEXT", }; |
| 32 | 0 | protected static String[] html1BlockTagStrings = { "A", "ADDRESS", "B", |
| 33 | |
"BLOCKQUOTE", "BODY", "CITE", "CODE", "DD", "DFN", "DIR", "DL", |
| 34 | |
"DT", "EM", "H1", "H2", "H3", "H4", "H5", "H6", "HEAD", "HTML", |
| 35 | |
"I", "KBD", "KEY", "LI", "LISTING", "MENU", "OL", "P", "PRE", |
| 36 | |
"SAMP", "STRONG", "TITLE", "TT", "U", "UL", "VAR", "XMP" }; |
| 37 | |
|
| 38 | 0 | protected static String[] html4BlockTagStrings = { "A", "ADDRESS", "B", |
| 39 | |
"BLOCKQUOTE", "BODY", "CITE", "CODE", "DD", "DFN", "DIR", "DL", |
| 40 | |
"DT", "EM", "H1", "H2", "H3", "H4", "H5", "H6", "HEAD", "HTML", |
| 41 | |
"I", "KBD", "KEY", "LI", "LISTING", "MENU", "OL", "P", "PRE", |
| 42 | |
"SAMP", "STRONG", "TITLE", "TABLE", "TR", "TH", "TD", "TT", "U", |
| 43 | |
"UL", "VAR", "XMP" }; |
| 44 | |
|
| 45 | |
static { |
| 46 | 0 | for (int i = 0; i < html1EmptyTagStrings.length; i++) |
| 47 | 0 | html1BlockTags.add(html1EmptyTagStrings[i]); |
| 48 | |
} |
| 49 | |
static { |
| 50 | 0 | for (int i = 0; i < html4BlockTagStrings.length; i++) |
| 51 | 0 | html1BlockTags.add(html4BlockTagStrings[i]); |
| 52 | |
} |
| 53 | |
static { |
| 54 | 0 | for (int i = 0; i < html1EmptyTagStrings.length; i++) |
| 55 | 0 | html1Tags.add(html1EmptyTagStrings[i]); |
| 56 | 0 | for (int i = 0; i < html4BlockTagStrings.length; i++) |
| 57 | 0 | html1Tags.add(html4BlockTagStrings[i]); |
| 58 | 0 | } |
| 59 | |
|
| 60 | |
|
| 61 | |
|
| 62 | |
|
| 63 | |
public HtmlStripper(OutputStream os) { |
| 64 | 0 | super(os); |
| 65 | 0 | } |
| 66 | |
|
| 67 | |
|
| 68 | |
|
| 69 | |
|
| 70 | |
|
| 71 | |
|
| 72 | |
public HtmlStripper(OutputStream os, String encoding) |
| 73 | |
throws UnsupportedEncodingException { |
| 74 | 0 | super(os, encoding); |
| 75 | 0 | } |
| 76 | |
|
| 77 | |
public void visit(HtmlDocument.TagBlock tagBlock) { |
| 78 | 0 | if (tagBlock.startTag.tagName.toUpperCase().equals("STYLE")) { |
| 79 | 0 | } else if (noButSpace(tagBlock.text())) { |
| 80 | 0 | } else if (!html1BlockTags.contains(tagBlock.startTag.tagName |
| 81 | |
.toUpperCase())) { |
| 82 | 0 | visit(tagBlock.body); |
| 83 | |
} else { |
| 84 | 0 | super.visit(tagBlock); |
| 85 | |
} |
| 86 | 0 | } |
| 87 | |
|
| 88 | |
public void visit(HtmlDocument.Tag t) { |
| 89 | 0 | if (html1Tags.contains(t.tagName.toUpperCase())) { |
| 90 | 0 | StringBuffer s = new StringBuffer(); |
| 91 | 0 | s.append("<"); |
| 92 | 0 | s.append(t.tagName); |
| 93 | 0 | if (!t.tagName.toUpperCase().equals("HTML")) { |
| 94 | 0 | for (Iterator iterator = t.attributeList.attributes.iterator(); iterator |
| 95 | 0 | .hasNext();) { |
| 96 | 0 | Attribute attribute = (Attribute) iterator.next(); |
| 97 | 0 | if (!attribute.name.toUpperCase().equals("STYLE")) { |
| 98 | 0 | if (!attribute.name.toUpperCase().equals("CLASS") |
| 99 | |
&& !attribute.name.toUpperCase().equals( |
| 100 | |
"MSONORMAL")) { |
| 101 | 0 | s.append(" "); |
| 102 | 0 | s.append(attribute.toString()); |
| 103 | |
} |
| 104 | |
} |
| 105 | 0 | } |
| 106 | |
} |
| 107 | 0 | if (t.emptyTag) |
| 108 | 0 | s.append("/"); |
| 109 | 0 | s.append(">"); |
| 110 | 0 | out.print(s.toString()); |
| 111 | |
} |
| 112 | 0 | } |
| 113 | |
|
| 114 | |
public void visit(HtmlDocument.Comment comment) { |
| 115 | |
|
| 116 | 0 | } |
| 117 | |
|
| 118 | |
boolean noButSpace(String in) { |
| 119 | 0 | String s = in.toLowerCase(); |
| 120 | 0 | boolean plausible = true; |
| 121 | 0 | while (plausible) |
| 122 | 0 | if (s.startsWith(" ")) |
| 123 | 0 | s = s.substring(6); |
| 124 | 0 | else if (s.startsWith(" ")) |
| 125 | 0 | s = s.substring(1); |
| 126 | |
else |
| 127 | 0 | plausible = false; |
| 128 | 0 | return s.equals(""); |
| 129 | |
} |
| 130 | |
|
| 131 | |
|
| 132 | |
|
| 133 | |
|
| 134 | |
public static void main(String[] args) throws Exception { |
| 135 | 0 | InputStream r = new FileInputStream(args[0]); |
| 136 | |
|
| 137 | |
try { |
| 138 | 0 | HtmlDocument document = new HtmlParser(r).HtmlDocument(); |
| 139 | 0 | int scrubberFlags = HtmlScrubber.DEFAULT_OPTIONS |
| 140 | |
| HtmlScrubber.TRIM_SPACES | HtmlScrubber.QUOTE_ATTRS; |
| 141 | |
|
| 142 | 0 | document.accept(new HtmlScrubber(scrubberFlags)); |
| 143 | 0 | document.accept(new HtmlCollector()); |
| 144 | 0 | ByteArrayOutputStream out = new ByteArrayOutputStream(); |
| 145 | 0 | document.accept(new HtmlStripper(out)); |
| 146 | 0 | InputStream r2 = new ByteArrayInputStream(out.toByteArray()); |
| 147 | 0 | HtmlDocument document2 = new HtmlParser(r2).HtmlDocument(); |
| 148 | 0 | document2.accept(new HtmlCollector()); |
| 149 | 0 | HtmlFormatter formatter = new HtmlFormatter(System.out); |
| 150 | 0 | formatter.setRightMargin(60); |
| 151 | 0 | formatter.setIndent(1); |
| 152 | 0 | document2.accept(formatter); |
| 153 | |
|
| 154 | |
} finally { |
| 155 | 0 | r.close(); |
| 156 | 0 | } |
| 157 | 0 | } |
| 158 | |
} |