提取HTML代码中文字的C#函数
/// <summary>
/// Removes HTML markup from a string and returns the remaining text.
/// </summary>
/// <remarks>
/// Processing order:
/// 1. script elements and HTML comments are removed together with their content;
/// 2. every remaining tag is removed;
/// 3. a line break followed by further whitespace is removed;
/// 4. a small set of character entities is decoded and all other numeric
///    entities (&amp;#NNN;) are dropped;
/// 5. any '&lt;' or '&gt;' character and any CR/LF pair still present is removed.
/// All patterns are matched case-insensitively.
/// Note: &amp;amp; is decoded before the other entities, so doubly escaped
/// input such as "&amp;amp;lt;" is decoded twice (and then removed by step 5).
/// </remarks>
/// <param name="strHtml">Source text containing HTML. May be null or empty.</param>
/// <returns>
/// The text with markup removed; an empty string when <paramref name="strHtml"/>
/// is null or empty.
/// </returns>
public static string StripHTML(string strHtml)
{
    if (string.IsNullOrEmpty(strHtml))
    {
        return string.Empty;
    }

    // Each row is { pattern, replacement }; rows are applied top to bottom.
    string[,] rules =
    {
        // Script blocks and comments in a single pass so that whichever opens
        // first wins. [\s\S] is used instead of '.' so the content may span lines.
        { @"<script\b[^>]*>[\s\S]*?</script\s*>|<!--[\s\S]*?-->", "" },

        // Any opening, closing or declaration tag. Quoted attribute values may
        // contain '>'. The three alternatives inside the loop start with
        // different characters, which keeps backtracking from blowing up on
        // unclosed tags.
        { @"<(?:\/\s*)?!?\w(?:[^<>""']|""[^""]*""|'[^']*')*>", "" },

        // A line break followed by more whitespace (this includes "\r\n").
        { @"([\r\n])[\s]+", "" },

        // Character entities.
        { @"&(quot|#34);", "\"" },
        { @"&(amp|#38);", "&" },
        { @"&(lt|#60);", "<" },
        { @"&(gt|#62);", ">" },
        { @"&(nbsp|#160);", " " },
        { @"&(iexcl|#161);", "\u00A1" },
        { @"&(cent|#162);", "\u00A2" },
        { @"&(pound|#163);", "\u00A3" },
        { @"&(copy|#169);", "\u00A9" },

        // Every other numeric entity is dropped.
        { @"&#(\d+);", "" }
    };

    string strOutput = strHtml;
    for (int i = 0; i < rules.GetLength(0); i++)
    {
        strOutput = Regex.Replace(strOutput, rules[i, 0], rules[i, 1], RegexOptions.IgnoreCase);
    }

    // String.Replace returns a new string, so the result has to be kept;
    // this leaves no angle brackets and no CR/LF pairs in the output.
    return strOutput
        .Replace("<", string.Empty)
        .Replace(">", string.Empty)
        .Replace("\r\n", string.Empty);
}
(2005-9-28:10:26) 您可能对 [RegExp] 的这些文章也感兴趣: