Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 5 additions & 10 deletions dev/company_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,19 @@

from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor
from simstring.measure.cosine import (
CosineMeasure,
) # , OverlapMeasure, LeftOverlapMeasure
CosineMeasure
)
from simstring.measure.left_cosine import (
LeftCosineMeasure as CosineMeasure
)

# from simstring.database.mongo import MongoDatabase
from simstring.database.dict import DictDatabase
from simstring.searcher import Searcher

from pyinstrument import Profiler

# Shared pyinstrument Profiler instance for this dev script.
profiler = Profiler()


def output_similar_strings_of_each_line(path, measure):
strings = []
with open(path, "r") as lines:
Expand Down Expand Up @@ -48,9 +49,3 @@ def output_similar_strings_of_each_line(path, measure):

# NOTE: the imports at the top of this script bind the name `CosineMeasure`
# twice; the later one is `LeftCosineMeasure as CosineMeasure`, so this run
# uses the left-cosine measure.
measure = CosineMeasure()
output_similar_strings_of_each_line("dev/data/company_names.txt", measure)

# measure = OverlapMeasure()
# output_similar_strings_of_each_line("dev/data/company_names.txt", measure)

# measure = LeftOverlapMeasure()
# output_similar_strings_of_each_line("./data/company_names.txt", measure)
20 changes: 20 additions & 0 deletions simstring/measure/left_cosine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import math
from typing import Iterable


class LeftCosineMeasure:
    """Asymmetric similarity measure normalised by the first feature set only.

    ``similarity(X, Y)`` is the fraction of the distinct features of ``X``
    that also occur in ``Y``, so a short ``X`` that is fully contained in a
    much longer ``Y`` still scores 1.0.
    """

    def min_feature_size(self, query_size: int, alpha: float) -> int:
        """Return ``ceil(alpha**2 * query_size)`` as the lower size bound."""
        return int(math.ceil(alpha * alpha * query_size))

    def max_feature_size(self, query_size: int, alpha: float) -> int:
        """Return ``floor(query_size / alpha**2)`` as the upper size bound.

        Raises:
            ZeroDivisionError: if ``alpha`` is 0.
        """
        return int(math.floor(query_size / (alpha * alpha)))

    def minimum_common_feature_count(
        self, query_size: int, y_size: int, alpha: float
    ) -> int:
        """Return ``ceil(alpha * sqrt(query_size * y_size))``."""
        # NOTE(review): this bound is symmetric in the two sizes, whereas
        # similarity() divides by the size of X alone -- confirm that it does
        # not prune candidates that similarity() would accept.
        return int(math.ceil(alpha * math.sqrt(query_size * y_size)))

    def similarity(self, X: Iterable[str], Y: Iterable[str]) -> float:
        """Return the share of distinct features of ``X`` also found in ``Y``.

        Duplicate features are ignored (both inputs are reduced to sets).
        Returns 0.0 when ``X`` has no features instead of dividing by zero.
        """
        # Materialise X exactly once: it is typed as Iterable, so it may be a
        # one-shot iterator that a second set(X) call would find exhausted.
        x_features = set(X)
        if not x_features:
            return 0.0
        return len(x_features & set(Y)) / len(x_features)
8 changes: 7 additions & 1 deletion tests/measure/test_cosine.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,4 +71,10 @@ def test_similarity(self):
]
self.assertEqual(
round(self.measure.similarity(a, b), 3), 0.788
) # BUG? Disagrees with paper that claims should be 0.788
)

name = ["Vl", "la", "ad", "di", "im", "mi", "ir", "r ", " P", "Pu", "ut", "ti", "in"]
sentence = ["Do", "on", "na", "at", "ti", "io", "on", "n ", " t", "to", "o ", " t", "he", "e ", " g", "gr", "re", "ea", "at", "t ", " V", "Vl", "la", "ad", "di", "im", "mi", "ir", "r ", " P", "Pu", "ut", "ti", "in", "n ", " m", "my", "y ", " f", "fa", "av", "vo", "or", "ri", "it", "te", "e ", " p", "pr", "re", "es", "si", "id", "de", "en", "nt"]
self.assertEqual(
round(self.measure.similarity(name, sentence), 2), 0.52
)
80 changes: 80 additions & 0 deletions tests/measure/test_left_cosine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# -*- coding:utf-8 -*-

from unittest import TestCase
from simstring.measure.left_cosine import LeftCosineMeasure


class TestLeftCosineMeasure(TestCase):
    # One measure instance is shared by every test in the class.
    measure = LeftCosineMeasure()

    def test_min_feature_size(self):
        # (alpha, expected lower bound) for a query of five features.
        for alpha, expected in ((1.0, 5), (0.5, 2)):
            self.assertEqual(self.measure.min_feature_size(5, alpha), expected)

    def test_max_feature_size(self):
        # (alpha, expected upper bound) for a query of five features.
        for alpha, expected in ((1.0, 5), (0.5, 20)):
            self.assertEqual(self.measure.max_feature_size(5, alpha), expected)

    def test_minimum_common_feature_count(self):
        # (query size, candidate size, alpha, expected overlap threshold)
        cases = (
            (5, 5, 1.0, 5),
            (5, 20, 1.0, 10),
            (5, 5, 0.5, 3),
        )
        for query_size, y_size, alpha, expected in cases:
            self.assertEqual(
                self.measure.minimum_common_feature_count(query_size, y_size, alpha),
                expected,
            )

    def test_similarity(self):
        score = self.measure.similarity

        short = ["a", "ab", "bc", "c"]
        longer = ["a", "ab", "bc", "cd", "e"]
        self.assertEqual(round(score(short, short), 2), 1.0)
        self.assertEqual(round(score(short, longer), 2), 0.75)

        # A feature list containing repeated entries.
        repeated = ["a", "ab", "ba", "ab", "a"]
        self.assertEqual(round(score(repeated, repeated), 2), 1.0)
        self.assertEqual(round(score(short, repeated), 2), 0.5)
        self.assertEqual(round(score(short, longer), 2), 0.75)

        # Quoted trigrams of "methyl sulfone" and "methyl sulphone", the
        # example pair taken from the paper.
        sulfone = [
            ' "m', '"me', "met", "eth", "thy", "hyl", "yl ", "l s",
            " su", "sul", "ulf", "lfo", "fon", "one", 'ne"', 'e" ',
        ]
        sulphone = [
            ' "m', '"me', "met", "eth", "thy", "hyl", "yl ", "l s",
            " su", "sul", "ulp", "lph", "pho", "hon", "one", 'ne"', 'e" ',
        ]
        self.assertEqual(round(score(sulfone, sulphone), 3), 0.812)

        # Every bigram of the name occurs inside the longer sentence, so the
        # score relative to the name is 1.0.
        name_bigrams = [
            "Vl", "la", "ad", "di", "im", "mi", "ir", "r ", " P", "Pu", "ut",
            "ti", "in",
        ]
        sentence_bigrams = [
            "Do", "on", "na", "at", "ti", "io", "on", "n ", " t", "to", "o ",
            " t", "he", "e ", " g", "gr", "re", "ea", "at", "t ", " V", "Vl",
            "la", "ad", "di", "im", "mi", "ir", "r ", " P", "Pu", "ut", "ti",
            "in", "n ", " m", "my", "y ", " f", "fa", "av", "vo", "or", "ri",
            "it", "te", "e ", " p", "pr", "re", "es", "si", "id", "de", "en",
            "nt",
        ]
        self.assertEqual(round(score(name_bigrams, sentence_bigrams), 2), 1.0)
23 changes: 23 additions & 0 deletions tests/test_searcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from simstring.searcher import Searcher
from simstring.database.dict import DictDatabase
from simstring.measure.cosine import CosineMeasure
from simstring.measure.left_cosine import LeftCosineMeasure
from simstring.measure.jaccard import JaccardMeasure
from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor

Expand Down Expand Up @@ -81,6 +82,28 @@ def test_ranked_search_example2(self):
self.assertEqual(results, goal)


class TestSearchLeftCosine(TestCase):
    """Search tests for a bigram ``DictDatabase`` queried with ``LeftCosineMeasure``."""

    def setUp(self) -> None:
        # Build a small name database indexed by character bigrams.
        db = DictDatabase(CharacterNgramFeatureExtractor(2))
        for entry in (
            "vladimir putin",
            "isis",
            "kim jong un",
            "isabel jose dos santos",
        ):
            db.add(entry)
        self.searcher = Searcher(db, LeftCosineMeasure())

    def test_search(self) -> None:
        # The stray breakpoint() call that used to sit in this test was
        # removed: it stops an unattended test run inside the debugger.
        search = self.searcher.search

        self.assertEqual(search("vlad putin", 0.7), ["vladimir putin"])
        self.assertEqual(
            search("donation for the benefit of make great again vlad putin", 0.7),
            ["vladimir putin"],
        )
        self.assertEqual(search("isis medical center", 0.7), ["isis"])

        # NOTE(review): the query ends in "isis" yet the expected hit is
        # "vladimir putin" -- confirm this expectation is intended.
        self.assertEqual(
            search("donation for the benefit of make great again isis", 0.5),
            ["vladimir putin"],
        )

        self.assertEqual(search("isabel santos", 0.7), ["isabel jose dos santos"])
        self.assertEqual(
            search("donation for the benefit of make great again vlad putin", 0.5),
            ["vladimir putin"],
        )


class TestRankedSearchCosineLong(TestCase):
def setUp(self) -> None:
db = DictDatabase(CharacterNgramFeatureExtractor(2))
Expand Down