您好,欢迎来到三六零分类信息网!老站,搜索引擎当天收录,欢迎发信息
免费发信息
三六零分类信息网 > 海口分类信息网,免费分类信息发布

字符串 - PHP 敏感词违法关键字检测 算法方案

2024/6/17 0:23:06发布73次查看
已有6000条关键字,分3批次。
一批为替换 replace,一批为遇到需要审核 censor,最后一批为遇到就禁止发布banned。
设计数据表如下:
mysql> desc tbl_censor;
+-------------+----------------------+------+-----+---------+----------------+
| field       | type                 | null | key | default | extra          |
+-------------+----------------------+------+-----+---------+----------------+
| id          | smallint(6) unsigned | no   | pri | null    | auto_increment |
| censortype  | smallint(6)          | no   |     | 1       |                |
| find        | varchar(120)         | no   | uni |         |                |
| replacement | varchar(255)         | no   |     |         |                |
| extra       | varchar(255)         | no   |     |         |                |
| uptime      | int(11)              | yes  |     | null    |                |
| enable      | int(1)               | no   |     | 1       |                |
+-------------+----------------------+------+-----+---------+----------------+
7 rows in set (0.01 sec)

由于有6000多关键字,使用 foreach 的 strstr?还是preg_match ?
追求效率,每小时提交量为10万多文章。
刚刚写的一种:
<?php

namespace app\helpers;

use app\models\other\censor;
use app\models\other\censorlog;

/**
 * Sensitive-word checker backed by the censor table.
 *
 * Keywords are split into three groups by their `replacement` column:
 *  - `{banned}`  : a hit means the content must be rejected,
 *  - `{censor}`  : a hit means the content needs manual review,
 *  - anything else: the keyword (`find`) is replaced by `replacement`.
 *
 * The compiled data is stored in the application cache under $id.
 *
 * NOTE(review): project-local identifiers (censor, censorlog, iphelper,
 * getuname) keep the casing shown in the original snippet; confirm against
 * the real class files before relying on a case-sensitive autoloader.
 */
class censorhelper
{
    /** @var string Cache key under which the compiled keyword data is stored. */
    public $id;

    /** @var array|mixed Compiled data: ['censor' => string, 'banned' => string, 'replace' => array]. */
    public $data;

    /** @var string[] Banned keywords found by the checks run so far. */
    public $match_banned;

    /** @var string[] Review-required keywords found by the checks run so far. */
    public $match_censor;

    /**
     * @param string $id Cache key for the compiled keyword data.
     */
    public function __construct($id = 'censor')
    {
        $this->id = $id;
        $this->match_banned = [];
        $this->match_censor = [];
        $this->data = $this->getdata();
    }

    /**
     * Load the compiled keyword data, from cache when available, otherwise
     * from the database (and then cache it).
     *
     * @return array|mixed ['censor' => pattern, 'banned' => pattern, 'replace' => ['from' => [], 'to' => []]]
     */
    public function getdata()
    {
        $data = \Yii::$app->cache->get($this->id);
        if (!empty($data)) {
            return $data;
        }

        $words = censor::find()
            ->where(['enable' => 1])
            ->andWhere(['!=', 'replacement', ''])
            // SORT_ASC / SORT_DESC are case-sensitive PHP constants.
            ->orderBy(['replacement' => SORT_ASC, 'find' => SORT_DESC])
            ->asArray()
            ->all();

        $censor = [];
        $banned = [];
        $replace = ['from' => [], 'to' => []];
        foreach ($words as $row) {
            switch ($row['replacement']) {
                case '{censor}':
                    $censor[] = $row['find'];
                    break;
                case '{banned}':
                    $banned[] = $row['find'];
                    break;
                default:
                    // str_replace() searches for 'from' and inserts 'to':
                    // the keyword is the search term, not the replacement.
                    $replace['from'][] = $row['find'];
                    $replace['to'][] = $row['replacement'];
                    break;
            }
        }

        $data = [
            'censor' => $this->generateregularexpression($censor),
            'banned' => $this->generateregularexpression($banned),
            'replace' => $replace,
        ];

        // Only cache when there is something to cache, so an empty table is
        // re-read on the next request instead of pinning an empty result.
        if ($censor || $banned || $replace['from']) {
            \Yii::$app->cache->set($this->id, $data);
        }

        return $data;
    }

    /**
     * Build one case-insensitive alternation pattern from a keyword list.
     *
     * The `u` modifier is deliberately not used: with it, a subject holding
     * invalid UTF-8 makes preg_* fail instead of match, which would let such
     * input slip past the filter. Byte-wise matching of UTF-8 keywords is exact.
     *
     * NOTE(review): a very large keyword list may exceed PCRE's compiled
     * pattern size limit; verify with the production list and split it into
     * several patterns if compilation fails.
     *
     * @param array $words Raw keywords (not regex-escaped).
     * @return string The pattern, or '' when there are no usable keywords.
     */
    public function generateregularexpression(array $words)
    {
        $quoted = [];
        foreach ($words as $word) {
            $word = (string) $word;
            if ($word === '') {
                // An empty alternative would match every input.
                continue;
            }
            // Pass the delimiter so keywords containing "/" stay literal.
            $quoted[] = preg_quote($word, '/');
        }

        if (!$quoted) {
            return '';
        }

        return '/' . implode('|', $quoted) . '/i';
    }

    /**
     * Run both the banned and the review checks on a string; results are
     * accumulated in $match_banned / $match_censor.
     *
     * @param string $string Text to inspect.
     */
    public function check($string)
    {
        $this->banned($string);
        $this->censor($string);
    }

    /**
     * Record every review-required keyword found in the string.
     *
     * @param string $string Text to inspect.
     */
    public function censor($string)
    {
        $pattern = empty($this->data['censor']) ? '' : $this->data['censor'];
        $this->match_censor = array_values(array_unique(
            array_merge($this->match_censor, $this->collectmatches($pattern, $string))
        ));
    }

    /**
     * Record every banned keyword found in the string.
     *
     * @param string $string Text to inspect.
     */
    public function banned($string)
    {
        $pattern = empty($this->data['banned']) ? '' : $this->data['banned'];
        $this->match_banned = array_values(array_unique(
            array_merge($this->match_banned, $this->collectmatches($pattern, $string))
        ));
    }

    /**
     * Return all substrings of $string matched by $pattern.
     *
     * @param string $pattern Pattern from generateregularexpression(), or ''.
     * @param string $string  Text to inspect.
     * @return string[] Matched substrings in order of appearance.
     */
    private function collectmatches($pattern, $string)
    {
        if ($pattern === '') {
            return [];
        }

        $found = preg_match_all($pattern, (string) $string, $matches);
        if ($found === false) {
            // Surface PCRE failures instead of silently treating them as "clean".
            trigger_error('censor pattern failed, preg_last_error=' . preg_last_error(), E_USER_WARNING);
            return [];
        }

        return $found > 0 ? $matches[0] : [];
    }

    /**
     * Drop the cached keyword data and reload it into this instance.
     */
    public function flush()
    {
        \Yii::$app->cache->delete($this->id);
        // Keep the in-memory copy in sync with the freshly rebuilt cache.
        $this->data = $this->getdata();
    }

    /**
     * Replace every "replace"-type keyword in the string.
     *
     * @param string $string Text to clean.
     * @return mixed The text with keywords replaced, or unchanged if none are configured.
     */
    public function replace($string)
    {
        if (empty($this->data['replace']['from'])) {
            return $string;
        }

        return str_replace($this->data['replace']['from'], $this->data['replace']['to'], $string);
    }

    /**
     * Overall verdict of the checks run so far.
     *
     * @return string 'banned', 'censor' or 'pass'.
     */
    public function getlevel()
    {
        if (!empty($this->match_banned)) {
            return 'banned';
        }
        if (!empty($this->match_censor)) {
            return 'censor';
        }

        return 'pass';
    }

    /**
     * Persist a log entry describing the matches found so far.
     *
     * @param mixed $tableid Identifier of the table the checked record belongs to.
     * @param mixed $dataid  Identifier of the checked record.
     * @return bool Result of saving the log record.
     */
    public function addlog($tableid, $dataid)
    {
        $log = new censorlog();
        $log->datatb = $tableid;
        $log->dataid = $dataid;
        $log->matchcensor = implode(',', $this->match_censor);
        $log->matchbanned = implode(',', $this->match_banned);
        $log->addtime = time();
        if (!\Yii::$app->user->isGuest) {
            $log->uid = \Yii::$app->user->getId();
            $log->uname = \Yii::$app->user->getuname();
        }
        $log->ip = iphelper::getip();
        $log->iploc = iphelper::getlocation($log->ip);

        return $log->save();
    }
}

回复内容: 已有6000条关键字,分3批次。
一批为替换 replace,一批为遇到需要审核 censor,最后一批为遇到就禁止发布banned。
设计数据表如下:
mysql> desc tbl_censor;
+-------------+----------------------+------+-----+---------+----------------+
| field       | type                 | null | key | default | extra          |
+-------------+----------------------+------+-----+---------+----------------+
| id          | smallint(6) unsigned | no   | pri | null    | auto_increment |
| censortype  | smallint(6)          | no   |     | 1       |                |
| find        | varchar(120)         | no   | uni |         |                |
| replacement | varchar(255)         | no   |     |         |                |
| extra       | varchar(255)         | no   |     |         |                |
| uptime      | int(11)              | yes  |     | null    |                |
| enable      | int(1)               | no   |     | 1       |                |
+-------------+----------------------+------+-----+---------+----------------+
7 rows in set (0.01 sec)

由于有6000多关键字,使用 foreach 的 strstr?还是preg_match ?
追求效率,每小时提交量为10万多文章。
刚刚写的一种:
<?php

namespace app\helpers;

use app\models\other\censor;
use app\models\other\censorlog;

/**
 * Sensitive-word checker backed by the censor table.
 *
 * Keywords are split into three groups by their `replacement` column:
 *  - `{banned}`  : a hit means the content must be rejected,
 *  - `{censor}`  : a hit means the content needs manual review,
 *  - anything else: the keyword (`find`) is replaced by `replacement`.
 *
 * The compiled data is stored in the application cache under $id.
 *
 * NOTE(review): project-local identifiers (censor, censorlog, iphelper,
 * getuname) keep the casing shown in the original snippet; confirm against
 * the real class files before relying on a case-sensitive autoloader.
 */
class censorhelper
{
    /** @var string Cache key under which the compiled keyword data is stored. */
    public $id;

    /** @var array|mixed Compiled data: ['censor' => string, 'banned' => string, 'replace' => array]. */
    public $data;

    /** @var string[] Banned keywords found by the checks run so far. */
    public $match_banned;

    /** @var string[] Review-required keywords found by the checks run so far. */
    public $match_censor;

    /**
     * @param string $id Cache key for the compiled keyword data.
     */
    public function __construct($id = 'censor')
    {
        $this->id = $id;
        $this->match_banned = [];
        $this->match_censor = [];
        $this->data = $this->getdata();
    }

    /**
     * Load the compiled keyword data, from cache when available, otherwise
     * from the database (and then cache it).
     *
     * @return array|mixed ['censor' => pattern, 'banned' => pattern, 'replace' => ['from' => [], 'to' => []]]
     */
    public function getdata()
    {
        $data = \Yii::$app->cache->get($this->id);
        if (!empty($data)) {
            return $data;
        }

        $words = censor::find()
            ->where(['enable' => 1])
            ->andWhere(['!=', 'replacement', ''])
            // SORT_ASC / SORT_DESC are case-sensitive PHP constants.
            ->orderBy(['replacement' => SORT_ASC, 'find' => SORT_DESC])
            ->asArray()
            ->all();

        $censor = [];
        $banned = [];
        $replace = ['from' => [], 'to' => []];
        foreach ($words as $row) {
            switch ($row['replacement']) {
                case '{censor}':
                    $censor[] = $row['find'];
                    break;
                case '{banned}':
                    $banned[] = $row['find'];
                    break;
                default:
                    // str_replace() searches for 'from' and inserts 'to':
                    // the keyword is the search term, not the replacement.
                    $replace['from'][] = $row['find'];
                    $replace['to'][] = $row['replacement'];
                    break;
            }
        }

        $data = [
            'censor' => $this->generateregularexpression($censor),
            'banned' => $this->generateregularexpression($banned),
            'replace' => $replace,
        ];

        // Only cache when there is something to cache, so an empty table is
        // re-read on the next request instead of pinning an empty result.
        if ($censor || $banned || $replace['from']) {
            \Yii::$app->cache->set($this->id, $data);
        }

        return $data;
    }

    /**
     * Build one case-insensitive alternation pattern from a keyword list.
     *
     * The `u` modifier is deliberately not used: with it, a subject holding
     * invalid UTF-8 makes preg_* fail instead of match, which would let such
     * input slip past the filter. Byte-wise matching of UTF-8 keywords is exact.
     *
     * NOTE(review): a very large keyword list may exceed PCRE's compiled
     * pattern size limit; verify with the production list and split it into
     * several patterns if compilation fails.
     *
     * @param array $words Raw keywords (not regex-escaped).
     * @return string The pattern, or '' when there are no usable keywords.
     */
    public function generateregularexpression(array $words)
    {
        $quoted = [];
        foreach ($words as $word) {
            $word = (string) $word;
            if ($word === '') {
                // An empty alternative would match every input.
                continue;
            }
            // Pass the delimiter so keywords containing "/" stay literal.
            $quoted[] = preg_quote($word, '/');
        }

        if (!$quoted) {
            return '';
        }

        return '/' . implode('|', $quoted) . '/i';
    }

    /**
     * Run both the banned and the review checks on a string; results are
     * accumulated in $match_banned / $match_censor.
     *
     * @param string $string Text to inspect.
     */
    public function check($string)
    {
        $this->banned($string);
        $this->censor($string);
    }

    /**
     * Record every review-required keyword found in the string.
     *
     * @param string $string Text to inspect.
     */
    public function censor($string)
    {
        $pattern = empty($this->data['censor']) ? '' : $this->data['censor'];
        $this->match_censor = array_values(array_unique(
            array_merge($this->match_censor, $this->collectmatches($pattern, $string))
        ));
    }

    /**
     * Record every banned keyword found in the string.
     *
     * @param string $string Text to inspect.
     */
    public function banned($string)
    {
        $pattern = empty($this->data['banned']) ? '' : $this->data['banned'];
        $this->match_banned = array_values(array_unique(
            array_merge($this->match_banned, $this->collectmatches($pattern, $string))
        ));
    }

    /**
     * Return all substrings of $string matched by $pattern.
     *
     * @param string $pattern Pattern from generateregularexpression(), or ''.
     * @param string $string  Text to inspect.
     * @return string[] Matched substrings in order of appearance.
     */
    private function collectmatches($pattern, $string)
    {
        if ($pattern === '') {
            return [];
        }

        $found = preg_match_all($pattern, (string) $string, $matches);
        if ($found === false) {
            // Surface PCRE failures instead of silently treating them as "clean".
            trigger_error('censor pattern failed, preg_last_error=' . preg_last_error(), E_USER_WARNING);
            return [];
        }

        return $found > 0 ? $matches[0] : [];
    }

    /**
     * Drop the cached keyword data and reload it into this instance.
     */
    public function flush()
    {
        \Yii::$app->cache->delete($this->id);
        // Keep the in-memory copy in sync with the freshly rebuilt cache.
        $this->data = $this->getdata();
    }

    /**
     * Replace every "replace"-type keyword in the string.
     *
     * @param string $string Text to clean.
     * @return mixed The text with keywords replaced, or unchanged if none are configured.
     */
    public function replace($string)
    {
        if (empty($this->data['replace']['from'])) {
            return $string;
        }

        return str_replace($this->data['replace']['from'], $this->data['replace']['to'], $string);
    }

    /**
     * Overall verdict of the checks run so far.
     *
     * @return string 'banned', 'censor' or 'pass'.
     */
    public function getlevel()
    {
        if (!empty($this->match_banned)) {
            return 'banned';
        }
        if (!empty($this->match_censor)) {
            return 'censor';
        }

        return 'pass';
    }

    /**
     * Persist a log entry describing the matches found so far.
     *
     * @param mixed $tableid Identifier of the table the checked record belongs to.
     * @param mixed $dataid  Identifier of the checked record.
     * @return bool Result of saving the log record.
     */
    public function addlog($tableid, $dataid)
    {
        $log = new censorlog();
        $log->datatb = $tableid;
        $log->dataid = $dataid;
        $log->matchcensor = implode(',', $this->match_censor);
        $log->matchbanned = implode(',', $this->match_banned);
        $log->addtime = time();
        if (!\Yii::$app->user->isGuest) {
            $log->uid = \Yii::$app->user->getId();
            $log->uname = \Yii::$app->user->getuname();
        }
        $log->ip = iphelper::getip();
        $log->iploc = iphelper::getlocation($log->ip);

        return $log->save();
    }
}

trie 树算法最适合。
php 关键词过滤扩展,该扩展依赖于 libdatrie(trie 算法的 c++ 实现)。
你这个敏感词匹配,不需要用到正则,只用简单的匹配或者替换就行了。
关键字分成三类存memcached。
然后对文章进行匹配,应该从最严厉的banned来匹配,接着是要censor的关键字,最后才是可以replace的敏感词。
1 遇到就禁止发布 => strpos
2 遇到需要审核 => strpos
3 替换 => str_replace
海口分类信息网,免费分类信息发布

VIP推荐

免费发布信息,免费发布B2B信息网站平台 - 三六零分类信息网 沪ICP备09012988号-2
企业名录