
Systemjudge
-
个人空间
相册
- 性别:
- 来自:
- 积分:6204
- 帖子:6032
- 注册:
2007-04-10
|
提取HTML代码中文字的C#函数
// Post title: C# function that extracts the text from HTML code
/// <summary>
/// Removes HTML markup from a string and returns the remaining text.
/// </summary>
/// <remarks>
/// The input is passed through an ordered list of case-insensitive regular-expression
/// replacements: script blocks and tags are deleted, a line break followed by whitespace
/// is collapsed, a fixed set of character entities is decoded, any other numeric entity
/// is dropped, and comment markers are removed. Finally every remaining angle bracket
/// and CR/LF pair is deleted, so the result never contains an angle bracket.
/// </remarks>
/// <param name="strHtml">Source text that contains HTML. May be null or empty.</param>
/// <returns>
/// The text with markup removed, or an empty string when <paramref name="strHtml"/>
/// is null or empty.
/// </returns>
public static string StripHTML(string strHtml)
{
    if (string.IsNullOrEmpty(strHtml))
    {
        return string.Empty;
    }

    // patterns[i] is replaced by replacements[i]; the two arrays must stay the same
    // length and in the same order, because later rules rely on the output of earlier ones.
    string[] patterns =
    {
        // Whole script element. (?s) lets '.' cross line breaks so that script bodies
        // spanning several lines are removed together with their tags.
        @"(?s)<script[^>]*?>.*?</script>",
        // Any opening/closing tag, including attributes whose quoted values contain '>'.
        // \7 outside the character class is a back-reference to the opening quote.
        // NOTE(review): the nested quantifiers here can backtrack heavily on malformed
        // input such as a long unterminated tag; consider a match timeout for untrusted input.
        @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
        // A line break followed by further whitespace.
        @"([\r\n])[\s]+",
        @"&(quot|#34);",
        // NOTE: '&amp;' is decoded before the entities below, so doubly-escaped input
        // such as '&amp;nbsp;' is decoded twice.
        @"&(amp|#38);",
        @"&(lt|#60);",
        @"&(gt|#62);",
        @"&(nbsp|#160);",
        @"&(iexcl|#161);",
        @"&(cent|#162);",
        @"&(pound|#163);",
        @"&(copy|#169);",
        // Every other numeric entity is dropped rather than decoded.
        @"&#(\d+);",
        // The comment terminator becomes a line break so the next rule, which stops at
        // the first '\n', can consume a comment up to that point.
        @"-->",
        @"<!--.*\n"
    };

    string[] replacements =
    {
        "",
        "",
        "",
        "\"",
        "&",
        "<",
        ">",
        " ",
        "\u00A1", // chr(161), inverted exclamation mark
        "\u00A2", // chr(162), cent sign
        "\u00A3", // chr(163), pound sign
        "\u00A9", // chr(169), copyright sign
        "",
        "\r\n",
        ""
    };

    string output = strHtml;
    for (int i = 0; i < patterns.Length; i++)
    {
        output = Regex.Replace(output, patterns[i], replacements[i], RegexOptions.IgnoreCase);
    }

    // string.Replace returns a new string, so the result has to be assigned back.
    // Tag removal is a single pass, so input like "<<b>i>" leaves "<i>" behind;
    // deleting every remaining bracket guarantees no tag survives in the output.
    output = output.Replace("<", "");
    output = output.Replace(">", "");
    output = output.Replace("\r\n", "");

    return output;
}
// (2005-9-28:10:26)
| 感谢原创者的辛勤劳动,希望对您有所帮助,转载请注明原出处。 |
|