C# 敏感词过滤算法实现-敏感词过滤算法

本文转载自微信公众号「UP技术控」，作者conan 。转载本文请联系UP技术控公众号。

敏感词、文字过滤是一个网站必不可少的功能，如何设计一个好的、高效的过滤算法是非常有必要的。

在实现文字过滤的算法中，DFA是唯一比较好的实现算法。DFA即Deterministic Finite Automaton，也就是确定有穷自动机，它是通过event和当前的state得到下一个state，即event+state=nextstate。在实现敏感词过滤的算法中，我们必须要减少运算，而DFA算法中几乎没有什么计算，有的只是状态的转换。

下面看一下在C#中的实现方式。

1、构建敏感词库类

/// <summary>
/// Builds the in-memory lexicon on first use. Words are grouped by their first
/// character: the lexicon is indexed by that character and each group stores
/// the remainder of the word (everything after the first character).
/// For every word, its Traditional Chinese conversion is indexed as well when
/// it differs from the original.
/// </summary>
/// <returns>
/// <c>true</c> when the lexicon is available (already built or built by this call);
/// <c>false</c> when the word source returned <c>null</c>.
/// </returns>
private bool LoadDictionary()
{
    // Already built: nothing to do.
    if (_memoryLexicon != null)
    {
        return true;
    }

    var words = new SensitiveWordBll().GetAllWords();
    if (words == null)
    {
        // Leave _memoryLexicon unset so that a later call retries the load
        // instead of reporting success with an empty lexicon.
        return false;
    }

    var wordList = new List<string>();
    foreach (string word in words)
    {
        // Null or empty entries cannot be indexed by first character.
        if (string.IsNullOrEmpty(word))
        {
            continue;
        }

        wordList.Add(word);
        var chineseWord = Microsoft.VisualBasic.Strings.StrConv(word,
            Microsoft.VisualBasic.VbStrConv.TraditionalChinese, 0);
        if (word != chineseWord)
        {
            wordList.Add(chineseWord);
        }
    }

    // char.MaxValue is 0xFFFF, so char.MaxValue + 1 slots are needed for every
    // possible char value to be a valid index.
    var lexicon = new WordGroup[char.MaxValue + 1];
    foreach (var word in wordList)
    {
        if (string.IsNullOrEmpty(word))
        {
            continue;
        }

        var group = lexicon[word[0]];
        if (group == null)
        {
            group = new WordGroup();
            lexicon[word[0]] = group;
        }
        group.Add(word.Substring(1));
    }

    // Assign the field only once the lexicon is completely populated.
    _memoryLexicon = lexicon;
    return true;
}

2、构建敏感词检测类

/// <summary>
/// Tries to match <paramref name="blackWord"/> against <c>_sourceText</c>,
/// starting at the character that follows <c>_cursor</c>.
/// Characters that are neither Chinese, numeric nor alphabetic (per
/// <c>IsChs</c>/<c>IsNum</c>/<c>IsAlphabet</c>) are skipped, and up to four
/// consecutive non-matching characters are tolerated before giving up.
/// Updates <c>_nextCursor</c> and <c>_wordlenght</c> as it advances.
/// </summary>
/// <param name="blackWord">The characters to match, one per source position.</param>
/// <returns>
/// <c>true</c> when every character of <paramref name="blackWord"/> was matched;
/// <c>false</c> otherwise (including when <paramref name="blackWord"/> is empty).
/// </returns>
private bool Check(string blackWord)
{
    _wordlenght = 0;
    // Cursor of the next source character to examine.
    _nextCursor = _cursor + 1;

    var matched = false;
    var toleratedMisses = 0;
    var wordIndex = 0;

    // Walk the word; wordIndex only advances when its character has been matched.
    while (wordIndex < blackWord.Length)
    {
        // Source exhausted while characters of the word are still pending.
        if (_nextCursor >= _sourceText.Length)
        {
            return false;
        }

        // Number of special characters skipped in front of the candidate position.
        var skip = 0;
        while (true)
        {
            var current = _sourceText[_nextCursor + skip];
            var isPlainChar = IsChs(current) || IsNum(current) || IsAlphabet(current);
            if (isPlainChar)
            {
                break;
            }

            skip++;
            // Skipping ran off the end of the source text: stop scanning.
            if (_nextCursor + skip >= _sourceText.Length)
            {
                break;
            }
            _wordlenght++;
        }

        var candidate = _nextCursor + skip;
        if (candidate >= _sourceText.Length)
        {
            return false;
        }

        if (blackWord[wordIndex] == _sourceText[candidate])
        {
            matched = true;
            toleratedMisses = 0;
            wordIndex++;
        }
        else if (toleratedMisses < 4 && _nextCursor < _sourceText.Length - 1)
        {
            // On a mismatch, retry the same word character against up to
            // four further source characters.
            toleratedMisses++;
        }
        else
        {
            return false;
        }

        _nextCursor = candidate + 1;
        _wordlenght++;
    }

    return matched;
}
 }

3、测试与使用方法

// Start each run with an empty list of detected words.
_illegalWords = new List<string>();
// Nothing to filter when neither the argument nor the stored text has content.
if (string.IsNullOrEmpty(sourceText) && string.IsNullOrEmpty(_sourceText))
{
    return sourceText;
}

if (!string.IsNullOrEmpty(sourceText))
{
    _sourceText = sourceText;
}
_cursor = 0;
// Without a lexicon the text is returned unchanged.
if (!LoadDictionary())
{
    return _sourceText;
}

var tempString = _sourceText.ToCharArray();
var sourceTextDbc = ToDBC(SourceText);
for (var i = 0; i < SourceText.Length; i++)
{
    // Look up the group of words whose first character is the current one.
    var group = _memoryLexicon[sourceTextDbc[i]];
    if (group != null)
    {
        for (var z = 0; z < group.Count(); z++)
        {
            string word = group.GetWord(z);
            // An empty entry means the lookup character alone is a match.
            var isSingleChar = word.Length == 0;
            if (isSingleChar || Check(word))
            {
                if (isFirstCheckedReturn)
                {
                    return null;
                }

                if (isSingleChar)
                {
                    // Check() was not called for this entry, so _wordlenght still
                    // holds the value left by an earlier call; the match spans
                    // exactly one character.
                    _wordlenght = 0;
                }

                // Record the matched span before masking it.
                var matchLength = _wordlenght + 1;
                _illegalWords.Add(new string(tempString, _cursor, matchLength));
                for (var pos = 0; pos < matchLength; pos++)
                {
                    tempString[pos + _cursor] = ReplaceChar;
                }

                // Jump past the matched span; the increments below move both
                // counters on to the following character.
                _cursor = _cursor + _wordlenght;
                i = i + _wordlenght;
                break;
            }
        }
    }
    _cursor++;
}
return new string(tempString);

// Demo: time the filter against a naive string.Replace loop over the same text.
var filter = new SensitiveWordFilter();
filter.SourceText = "dddddd";
var sourceText = filter.SourceText;
filter.ResetMemoryLexicon();

// Stopwatch measures elapsed time independently of system clock adjustments.
var filterTimer = System.Diagnostics.Stopwatch.StartNew();
var ss = filter.Filter();
filterTimer.Stop();
Console.WriteLine(filterTimer.Elapsed.TotalMilliseconds);
Console.WriteLine(ss);

// Baseline: replace every word from the word file with asterisks of equal length.
var words = System.IO.File.ReadAllLines(@"D:\Recv\敏感词库大全.txt", System.Text.Encoding.UTF8);
var ssx = sourceText;
var replaceTimer = System.Diagnostics.Stopwatch.StartNew();
foreach (var word in words)
{
    if (word.Length > 0)
    {
        ssx = ssx.Replace(word, new string('*', word.Length));
    }
}
replaceTimer.Stop();
Console.WriteLine(replaceTimer.Elapsed.TotalMilliseconds);
Console.WriteLine(ssx);