Regex.Replace 方法的性能!
園子里有很多關于去除Html標簽的文章。一個常用的經驗是使用 Regex.Replace 方法利用正則去替換。這里有一篇使用該方法的文章 C#中如何去除HTML標記 。下面我貼出該方法的代碼,見代碼清單1-1
代碼清單1-1 引用 http://www.cnblogs.com/zoupeiyang/archive/2009/06/22/1508039.html
///
/// 去除HTML標記
///
/// 包括HTML的源碼
/// 已經去除后的文字
public static string ReplaceHtmlTag(string Htmlstring)
{
//刪除腳本
Htmlstring = Htmlstring.Replace("\r\n", "");
Htmlstring = Regex.Replace(Htmlstring, @"", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"",@"", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"<.*?>", "", RegexOptions.IgnoreCase);
//刪除HTML
Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"",@"",@"<.*?>",
@"<(.[^>]*)>",@"([\r\n])[\s]+",@"-->",
@"",@"&(quot|#34);",@"&(amp|#38);",
@"&(lt|#60);",@"&(gt|#62);",@"&(nbsp|#160);",
@"&(iexcl|#161);",@"&(cent|#162);",@"&(pound|#163);",
@"&(copy|#169);",@"&(\d+);"
};
string[] replacement = new string[]
{
"","","","","","","","\"","&","<",">","","\xa1","\xa2","\xa3","\xa9",""
};
#endregion
if (pattern.Length != replacement.Length)
{
throw new Exception("正則表達式數組和替換后的字符數組的長度不一致!");
}
int count = 0; //計數器
foreach (string str in pattern)
{
Regex aRegex = new Regex(str);
aReplaceHtml.AddRegex(aRegex, replacement[count]);
count += 1;
}
return aReplaceHtml;
}
/// <summary>
/// Adds one Regex object together with its replacement string.
/// </summary>
/// <param name="aRegex">The Regex object to add.</param>
/// <param name="Replacement">The replacement string that corresponds to <paramref name="aRegex"/>.</param>
private void AddRegex(Regex aRegex, string Replacement)
{
// Each call appends one entry to _regexs and one to _replacement, in that order.
// NOTE(review): presumably these are index-aligned lists (entry i of one pairs with
// entry i of the other) - confirm against the field declarations, which are not shown here.
_regexs.Add(aRegex);
_replacement.Add(Replacement);
}
}
}
/// 去除HTML標記
///
/// 包括HTML的源碼
/// 已經去除后的文字
public static string ReplaceHtmlTag(string Htmlstring)
{
//刪除腳本
Htmlstring = Htmlstring.Replace("\r\n", "");
Htmlstring = Regex.Replace(Htmlstring, @"", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"",@"", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"<.*?>", "", RegexOptions.IgnoreCase);
//刪除HTML
Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"",@"",@"<.*?>",
@"<(.[^>]*)>",@"([\r\n])[\s]+",@"-->",
@"",@"&(quot|#34);",@"&(amp|#38);",
@"&(lt|#60);",@"&(gt|#62);",@"&(nbsp|#160);",
@"&(iexcl|#161);",@"&(cent|#162);",@"&(pound|#163);",
@"&(copy|#169);",@"&(\d+);"
};
string[] replacement = new string[]
{
"","","","","","","","\"","&","<",">","","\xa1","\xa2","\xa3","\xa9",""
};
#endregion
if (pattern.Length != replacement.Length)
{
throw new Exception("正則表達式數組和替換后的字符數組的長度不一致!");
}
int count = 0; //計數器
foreach (string str in pattern)
{
Regex aRegex = new Regex(str);
aReplaceHtml.AddRegex(aRegex, replacement[count]);
count += 1;
}
return aReplaceHtml;
}
/// <summary>
/// Adds one Regex object together with its replacement string.
/// </summary>
/// <param name="aRegex">The Regex object to add.</param>
/// <param name="Replacement">The replacement string that corresponds to <paramref name="aRegex"/>.</param>
private void AddRegex(Regex aRegex, string Replacement)
{
// Each call appends one entry to _regexs and one to _replacement, in that order.
// NOTE(review): presumably these are index-aligned lists (entry i of one pairs with
// entry i of the other) - confirm against the field declarations, which are not shown here.
_regexs.Add(aRegex);
_replacement.Add(Replacement);
}
}
}
該類的使用如下,見代碼清單1-7
代碼清單1-7
/// <summary>
/// Strips HTML markup from a string by forwarding the call to
/// <c>ReplaceHtml.Instance.ReplaceHtmlTag</c>.
/// </summary>
/// <param name="Htmlstring">The HTML source text to process.</param>
/// <returns>Whatever <c>ReplaceHtml.Instance.ReplaceHtmlTag</c> returns for the input.</returns>
public static string ReplaceHtmlTag2(string Htmlstring)
{
    // Fetch the ReplaceHtml.Instance object first, then hand the input to it.
    var htmlStripper = ReplaceHtml.Instance;
    string strippedText = htmlStripper.ReplaceHtmlTag(Htmlstring);
    return strippedText;
}
{
return ReplaceHtml.Instance.ReplaceHtmlTag(Htmlstring);
}
寫到這里,讓我們來測試一下這 2 種方法在性能上的差距。經過測試,在重復執行 ReplaceHtmlTag 方法和 ReplaceHtmlTag2 方法 10、100、1000 次后,兩者的性能相差在 2-15 倍左右。具體見圖1-1
圖1-1 2種方法執行 1000 次所消耗的時間對比
說明:這 2 種方法在處理短字符串時,性能差距很大;而在處理長文本時差距會縮小。我用新浪的首頁做過測試,2 種方法的性能只相差 1 倍。附上源代碼,感興趣的讀者可自行測試!:-)
這里下載: RegexTest.rar
End.
0
0
全站熱搜