提取HTML的正文类
本文转载:http://blog.csdn.net/cjh200102/article/details/6824895
//2、提取html的正文 类 using
System; using
System.Text; namespace
HtmlStrip { class
MainClass { public
static
void
Main ( string [] args) { string
str = "<div>abc</div><span>efg</span><br /><script>888</script><!--<PA>WW</PA-->oo" ; //System.IO.StreamReader rd=new System.IO.StreamReader ("/home/lx/test.html"); //str=rd.ReadToEnd (); HtmlParser t = new
HtmlParser (str); // t.KeepTag ( new
string [] { "br"
}); //设置br标签不过虑 Console.Write (t.Text ()); } } class
HtmlParser { private
string [] htmlcode; //把html转为数组形式用于分析 private
StringBuilder result = new
StringBuilder (); //输出的结果 private
int
seek; //分析文本时候的指针位置 private
string [] keepTag; //用于保存要保留的尖括号内容 private
bool
_inTag; //标记现在的指针是不是在尖括号内 private
bool
needContent = true ; //是否要提取正文 private
string
tagName; //当前尖括号的名字 private
string [] specialTag = new
string [] { "script" , "style" , "!--"
}; //特殊的尖括号内容,一般这些标签的正文是不要的 /// <summary> /// 当指针进入尖括号内,就会触发这个属性。这里主要逻辑是提取尖括号里的标签名字 /// </summary> public
bool
inTag { get
{ return
_inTag; } set
{ _inTag = value; if
(!value) return ; bool
ok = true ; tagName = "" ; while
(ok) { string
word = read (); if
(word != " "
&& word != ">" ) { tagName += word; } else
if
(word == " "
&& tagName.Length > 0) { ok = false ; } else
if
(word == ">" ) { ok = false ; inTag = false ; seek -= 1; } } } } /// <summary> /// 初始化类 /// </summary> /// <param name="html"> /// 要分析的html代码 /// </param> public
HtmlParser ( string
html) { htmlcode = new
string [html.Length]; for
( int
i = 0; i < html.Length; i++) { htmlcode[i] = html[i].ToString (); } KeepTag ( new
string [] { }); } /// <summary> /// 设置要保存那些标签不要被过滤掉 /// </summary> /// <param name="tags"> /// /// </param> public
void
KeepTag ( string [] tags) { keepTag = tags; } /// <summary> /// /// </summary> /// <returns> /// 输出处理后的文本 /// </returns> public
string
Text () { int
startTag = 0; int
endTag = 0; while
(seek < htmlcode.Length) { string
word = read (); if
(word.ToLower () == "<" ) { startTag = seek; inTag = true ; } else
if
(word.ToLower () == ">" ) { endTag = seek; inTag = false ; if
(iskeepTag (tagName.Replace ( "/" , "" ))) { for
( int
i = startTag - 1; i < endTag; i++) { result.Append (htmlcode[i].ToString ()); } } else
if
(tagName.StartsWith ( "!--" )) { bool
ok = true ; while
(ok) { if
(read () == "-" ) { if
(read () == "-" ) { if
(read () == ">" ) { ok = false ; } else
{ seek -= 1; } } } } } else
{ foreach
( string
str in
specialTag) { if
(tagName == str) { needContent = false ; break ; } else needContent = true ; } } } else
if
(!inTag && needContent) { result.Append (word); } } return
result.ToString (); } /// <summary> /// 判断是否要保存这个标签 /// </summary> /// <param name="tag"> /// A <see cref="System.String"/> /// </param> /// <returns> /// A <see cref="System.Boolean"/> /// </returns> private
bool
iskeepTag ( string
tag) { foreach
( string
ta in
keepTag) { if
(tag.ToLower () == ta.ToLower ()) { return
true ; } } return
false ; } private
string
read () { return
htmlcode[seek++]; } } } |
郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。