We have a C2C website and we discourage selling branded products on our website. We have built a database of brand words such as Nike and D&G and made an algorithm that filters product information for these words and disables products if it contains these words.
Our current algorithm removes all whitespace and special characters from the provided text and matches the result against the words in the database. The following cases must be caught by the algorithm, and they are caught efficiently:
Now the problem is that it also catches the following:
What can be done to prevent such false matches while preserving efficiency with catching true cases?
EDIT
Here's the code for those of you who understand code better:
// Remove markup first, then drop anything that looks like an HTML entity
// (an ampersand up to the next semicolon).
$orignal_txt = strip_tags($orignal_txt);
$orignal_txt = preg_replace('/&.{0,}?;/', '', $orignal_txt);

// Collapsed copy of the text: every non-word character (spaces, punctuation)
// is stripped so that "n-i-k-e" and "n i k e" both read as "nike".
$orignal_txt_nospace = preg_replace('/\W/', '', $orignal_txt);
{
    $qry_kws = array("nike", "iphone", "d&g");
    foreach ($qry_kws as $rs_kw) {
        // Same collapsing applied to the keyword itself ("d&g" becomes "dg").
        $no_space_db_kw = preg_replace('/\W/', '', $rs_kw);

        // A keyword is flagged when either its raw form or its collapsed form
        // occurs (case-insensitively) anywhere in the collapsed text.
        if (stristr($orignal_txt_nospace, $rs_kw) || stristr($orignal_txt_nospace, $no_space_db_kw)) {
            $ipr_banned_keywords[] = strtolower($rs_kw);
        }
    }
}
Just playing around .... (Not to be used in production)
// Sample phrases used by the examples: a mix of plain brand mentions,
// spacing/punctuation tricks and harmless text that merely resembles a brand.
$data = array(
    "i am nike world",
    "i have n ikee shoes",
    "i have nikeeshoes",
    "i sell i-phone casings",
    "i sell iphone-casings",
    "you can have iphone",
    "rapiD Garment factor",
    "rosNIK Electronics",
    "Buy you self N I K E",
    "B*U*Y I*P*H*O*N*E BABY",
    "My Phone Is not available"
);

// Brand words the phrases are screened against.
$ban = array(
    "nike",
    "d&g",
    "iphone"
);
Example 1:
// Default mode: after screening, iterating the filter yields only the
// phrases that were not rejected by any ban word.
$filter = new BrandFilterIterator($data);
$filter->parseBan($ban);
foreach ($filter as $phrase) {
    echo $phrase . PHP_EOL;
}
Output 1
rapiD Garment factor
rosNIK Electronics
My Phone Is not available
Example 2
// Reverse mode: iterating the filter yields only the rejected phrases, each
// printed together with the JSON-encoded ban words that matched it.
$filter = new BrandFilterIterator($data, true);
$filter->parseBan($ban);
foreach ($filter as $phrase) {
    $matchedBans = json_encode($phrase->getBan());
    echo "{$phrase} {$matchedBans}" . PHP_EOL;
}
Output 2
i am nike world ["nike"]
i have n ikee shoes ["nike"]
i have nikeeshoes ["nike"]
i sell i-phone casings ["iphone"]
i sell iphone-casings ["iphone"]
you can have iphone ["iphone"]
Buy you self N I K E ["nike"]
B*U*Y I*P*H*O*N*E BABY ["iphone"]
Class Used
/**
 * Screens a list of phrases against banned brand words.
 *
 * Each input string is wrapped in a Word object. Once parseBan() has been
 * called, iterating the filter yields the Word objects that were not rejected;
 * in reverse mode it yields only the rejected ones.
 */
class BrandFilterIterator extends FilterIterator {
    /** @var Word[] Wrapped phrases, in the order they were supplied. */
    private $words = array();

    /** @var bool When true, the filter yields rejected phrases instead of accepted ones. */
    private $reverse = false;

    /**
     * @param array $words   Raw phrases to screen.
     * @param bool  $reverse Pass true to iterate over the rejected phrases only.
     */
    public function __construct(array $words, $reverse = false) {
        $this->reverse = $reverse;
        foreach ($words as $word) {
            $this->words[] = new Word($word);
        }
        parent::__construct(new ArrayIterator($this->words));
    }

    /**
     * Checks every wrapped phrase against every ban word.
     *
     * The verdicts are stored on the Word objects themselves, so this must run
     * before the filter is iterated for the results to be meaningful.
     *
     * @param array $ban Banned brand words.
     */
    public function parseBan(array $ban) {
        foreach ($ban as $item) {
            foreach ($this->words as $word) {
                $word->checkMetrix($item);
            }
        }
    }

    /**
     * Decides whether the current phrase is part of the iteration.
     *
     * FilterIterator::accept() carries a tentative bool return type since
     * PHP 8.1; the attribute below suppresses the resulting deprecation notice,
     * while PHP versions before 8.0 read that line as an ordinary comment.
     *
     * @return bool
     */
    #[\ReturnTypeWillChange]
    public function accept() {
        $passed = $this->getInnerIterator()->current()->accept();
        return $this->reverse ? !$passed : $passed;
    }
}
/**
 * A single phrase together with the verdict of the brand-word screening.
 *
 * The phrase starts out accepted; every ban word that matches it through
 * checkMetrix() is recorded and flips the phrase to rejected.
 */
class Word {
    /** @var array Ban words that matched so far (may contain repeats). */
    private $ban = array();

    /** @var string The phrase exactly as supplied. */
    private $word;

    /** @var array The phrase split on single spaces. */
    private $parts;

    /** @var bool False as soon as any ban word has matched. */
    private $accept = true;

    /**
     * @param string $word Phrase to screen.
     */
    public function __construct($word) {
        $this->word = $word;
        $this->parts = explode(" ", $word);
    }

    /**
     * @return string The original, unmodified phrase.
     */
    public function __toString() {
        return $this->word;
    }

    /**
     * @return string The phrase with every non-word character removed.
     */
    public function getTrim() {
        return preg_replace('/\W/', '', $this->word);
    }

    /**
     * @return bool True while no ban word has matched the phrase.
     */
    public function accept() {
        return $this->accept;
    }

    /**
     * Returns the distinct ban words that matched this phrase.
     *
     * array_unique() keeps the original keys, which leaves gaps after repeats
     * are dropped; the result is re-indexed so it is a plain list (and so
     * json_encode() renders it as a JSON array rather than an object).
     *
     * @return array
     */
    public function getBan() {
        return array_values(array_unique($this->ban));
    }

    /**
     * Marks the phrase as rejected, optionally recording the ban word.
     *
     * @param string|null $ban Ban word responsible for the rejection.
     * @return bool Always false (the new acceptance state).
     */
    public function reject($ban = null) {
        if ($ban !== null) {
            $this->ban[] = $ban;
        }
        $this->accept = false;
        return $this->accept;
    }

    /**
     * Tests the phrase against one ban word and rejects it on a match.
     *
     * Two checks are made:
     *  1. Each space-separated token is compared with the ban word using
     *     similar_text(). The token matches when the similarity percentage
     *     reaches the ban word's length expressed as a percentage of the
     *     token's length, or when that percentage is 100 and the similarity
     *     is at least 75%.
     *  2. When stripping non-word characters shrinks the phrase to under 75%
     *     of its length (heavy use of spaces or symbols), the stripped phrase
     *     is searched for the ban word as a case-insensitive substring.
     *
     * Empty tokens (produced by consecutive spaces) and empty phrases are
     * skipped because their zero length would be used as a divisor. An empty
     * ban word is ignored, since it would otherwise match every phrase.
     *
     * @param string $ban Banned brand word.
     * @return bool The acceptance state after this check.
     */
    public function checkMetrix($ban) {
        $ban = strtolower($ban);
        $banLength = strlen($ban);
        if ($banLength === 0) {
            return $this->accept;
        }

        foreach ($this->parts as $part) {
            if ($part === '') {
                continue;
            }
            $part = strtolower($part);

            // Similarity (in percent) this token has to reach to count as a match.
            $t = ceil($banLength / strlen($part) * 100);
            similar_text($part, $ban, $p);

            // The original also computed levenshtein($part, $part), which is
            // always 0, so its "== 0" test never affected the outcome.
            if (ceil($p) >= $t || ($t == 100 && $p >= 75)) {
                $this->reject($ban);
            }
        }

        // Detect bad use of spaces / filler characters.
        $wordLength = strlen($this->word);
        if ($wordLength > 0) {
            $trimmed = $this->getTrim();
            $keptPercent = ceil(strlen($trimmed) / $wordLength * 100);
            if ($keptPercent < 75 && stripos($trimmed, $ban) !== false) {
                $this->reject($ban);
            }
        }

        return $this->accept;
    }
}
If you love us, you can donate via PayPal or buy us a coffee so we can maintain and grow the site. Thank you!
Donate Us With