From 96d1ee713b3ce4736b0de18617cf61678e2b0be2 Mon Sep 17 00:00:00 2001 From: podhmo Date: Fri, 31 May 2019 22:14:46 +0900 Subject: [PATCH 1/4] feat: impl --- dictknife/query.py | 101 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 dictknife/query.py diff --git a/dictknife/query.py b/dictknife/query.py new file mode 100644 index 00000000..f4890835 --- /dev/null +++ b/dictknife/query.py @@ -0,0 +1,101 @@ +import operator +from collections import defaultdict + + +def _to_accesssor(k): + if callable(k): + return k + elif isinstance(k, (str, bytes)): + return operator.itemgetter(k) + elif isinstance(k, (list, tuple)): + # todo: compile? + return lambda v: tuple([v.get(sk) for sk in k]) + else: + raise ValueError(k) + + +class Options: + def __init__(self, *, missing_value=None, accessor_factory=_to_accesssor) -> None: + self.missing_value = missing_value + self.accessor_factory = accessor_factory + + +_default_options = Options() + + +def how_inner_join(left, right, left_k, right_k, *, options=_default_options): + right_cache = defaultdict(list) + for x in right: + right_cache[right_k(x)].append(x) + + for lv in left: + lk = left_k(lv) + if lk not in right_cache: + continue + for rv in right_cache[lk]: + yield (lv, rv) + + +def how_left_outer_join(left, right, left_k, right_k, *, options=_default_options): + missing_value = options.missing_value + + right_cache = defaultdict(list) + for x in right: + right_cache[right_k(x)].append(x) + + for lv in left: + k = left_k(lv) + if k in right_cache: + for rv in right_cache[k]: + yield (lv, rv) + else: + yield (lv, missing_value) + + +def how_right_outer_join(left, right, left_k, right_k, *, options=_default_options): + return how_left_outer_join(right, left, right_k, left_k, options=options) + + +def how_full_outer_join(left, right, left_k, right_k, *, options=_default_options): + missing_value = options.missing_value + + right_cache = defaultdict(list) + for x in right: + right_cache[right_k(x)].append(x) + + right_used = set() + + for lv in left: + lk = left_k(lv) + if lk in right_cache: + right_used.add(lk) + for rv in right_cache[lk]: + yield (lv, rv) + else: + yield (lv, missing_value) + + for rv in right: + rk = right_k(rv) + if rk in right_used: + continue + yield (missing_value, rv) + + +def join( + left, + right, + *, + left_on=None, + right_on=None, + on=None, + how=how_inner_join, + options=_default_options, +): + assert on or (left_on and right_on) + + if on is not None: + left_on = right_on = on + + left_on_accessor = options.accessor_factory(left_on) + right_on_accessor = options.accessor_factory(right_on) + yield from how(left, right, left_on_accessor, right_on_accessor, options=options) From 17b2d8263a573c7733d1f828f353875fc680be4a Mon Sep 17 00:00:00 2001 From: podhmo Date: Fri, 31 May 2019 22:29:47 +0900 Subject: [PATCH 2/4] tests: add tests --- dictknife/tests/test_query.py | 94 +++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 dictknife/tests/test_query.py diff --git a/dictknife/tests/test_query.py b/dictknife/tests/test_query.py new file mode 100644 index 00000000..996a5fde --- /dev/null +++ b/dictknife/tests/test_query.py @@ -0,0 +1,94 @@ +import unittest +from collections import namedtuple +import itertools +import copy +import os + +COLSIZE = int(os.environ.get("COLSIZE") or "60") + + +class JoinTests(unittest.TestCase): + def _callFUT(self, *args, **kwargs): + from dictknife.query import join + + return list(join(*args, **kwargs)) + + def test_it(self): + class data: + x_packages = [ + {"version": "2.7", "downloads": 1000}, + {"version": "3.5", "downloads": 2000}, + {"version": "3.6", "downloads": 3000}, + {"version": "3.7", "downloads": 3000}, + ] + y_packages = [ + {"version": "3.5", "downloads": 2000}, + {"version": "3.6", "downloads": 2000}, + {"version": "3.7", "downloads": 2000}, + {"version": "3.8", "downloads": 500}, + ] + + class copied: + x_packages = copy.deepcopy(data.x_packages) + y_packages = copy.deepcopy(data.y_packages) + + C = namedtuple("C", "msg, args, kwargs, want") + cases = [ + C( + msg="inner join", + args=["x_packages", "y_packages"], + kwargs={"on": "version"}, + want=[ + ( + {"version": "3.5", "downloads": 2000}, + {"version": "3.5", "downloads": 2000}, + ), + ( + {"version": "3.6", "downloads": 3000}, + {"version": "3.6", "downloads": 200}, + ), + ( + {"version": "3.7", "downloads": 3000}, + {"version": "3.7", "downloads": 2000}, + ), + ], + ) + ] + for c in cases: + with self.subTest(msg=c.msg, args=c.args, kwargs=c.kwargs): + args = [getattr(data, name) for name in c.args] + got = self._callFUT(*args, **c.kwargs) + self.assertTrue( + got == c.want, msg=_DifferenceReportText(got=got, want=c.want) + ) + self.assertEqual(got, c.want) + self.assertTrue(data.x_packages == copied.x_packages, "not modified") + self.assertTrue(data.y_packages == copied.y_packages, "not modified") + + +class _DifferenceReportText: + def __init__(self, *, got, want): + self.got = got + self.want = want + + def __str__(self): + import json + + fmt = "{left:%d}\t{right:%d}" % (COLSIZE, COLSIZE) + r = [ + "", + fmt.format(left="want", right="got"), + "----------------------------------------------------------------------", + ] + for lhs, rhs in itertools.zip_longest(self.want, self.got): + r.append( + fmt.format( + left=json.dumps(lhs, sort_keys=True), + right=json.dumps(rhs, sort_keys=True), + ) + ) + return "\n".join(r) + + +if __name__ == "__main__": + unittest.main() From 2edd68de8e7ee8fdf74cc1d2cf84117414996e1a Mon Sep 17 00:00:00 2001 From: podhmo Date: Fri, 31 May 2019 22:36:09 +0900 Subject: [PATCH 3/4] tests: more tests --- dictknife/query.py | 3 +- dictknife/tests/test_query.py | 67 +++++++++++++++++++++++++++++++++-- 2 files changed, 67 insertions(+), 3 deletions(-) diff --git a/dictknife/query.py b/dictknife/query.py index f4890835..a7c84ecb 100644 --- a/dictknife/query.py +++ b/dictknife/query.py @@ -53,7 +53,8 @@ def how_left_outer_join(left, right, left_k, right_k, *, options=_default_option def how_right_outer_join(left, right, left_k, right_k, *, options=_default_options): - return how_left_outer_join(right, left, right_k, left_k, options=options) + for r, l in how_left_outer_join(right, left, right_k, left_k, options=options): + yield l, r def how_full_outer_join(left, right, left_k, right_k, *, options=_default_options): diff --git a/dictknife/tests/test_query.py b/dictknife/tests/test_query.py index 996a5fde..94333c23 100644 --- a/dictknife/tests/test_query.py +++ b/dictknife/tests/test_query.py @@ -14,6 +14,8 @@ def _callFUT(self, *args, **kwargs): return list(join(*args, **kwargs)) def test_it(self): + from dictknife import query + class data: x_packages = [ {"version": "2.7", "downloads": 1000}, @@ -45,14 +47,75 @@ class copied: ), ( {"version": "3.6", "downloads": 3000}, - {"version": "3.6", "downloads": 200}, + {"version": "3.6", "downloads": 2000}, ), ( {"version": "3.7", "downloads": 3000}, {"version": "3.7", "downloads": 2000}, ), ], - ) + ), + C( + msg="left outer join", + args=["x_packages", "y_packages"], + kwargs={"on": "version", "how": query.how_left_outer_join}, + want=[ + ({"version": "2.7", "downloads": 1000}, None), + ( + {"version": "3.5", "downloads": 2000}, + {"version": "3.5", "downloads": 2000}, + ), + ( + {"version": "3.6", "downloads": 3000}, + {"version": "3.6", "downloads": 2000}, + ), + ( + {"version": "3.7", "downloads": 3000}, + {"version": "3.7", "downloads": 2000}, + ), + ], + ), + C( + msg="right outer join", + args=["x_packages", "y_packages"], + kwargs={"on": "version", "how": query.how_right_outer_join}, + want=[ + ( + {"version": "3.5", "downloads": 2000}, + {"version": "3.5", "downloads": 2000}, + ), + ( + {"version": "3.6", "downloads": 3000}, + {"version": "3.6", "downloads": 2000}, + ), + ( + {"version": "3.7", "downloads": 3000}, + {"version": "3.7", "downloads": 2000}, + ), + (None, {"version": "3.8", "downloads": 500}), + ], + ), + C( + msg="full outer join", + args=["x_packages", "y_packages"], + kwargs={"on": "version", "how": query.how_full_outer_join}, + want=[ + ({"version": "2.7", "downloads": 1000}, None), + ( + {"version": "3.5", "downloads": 2000}, + {"version": "3.5", "downloads": 2000}, + ), + ( + {"version": "3.6", "downloads": 3000}, + {"version": "3.6", "downloads": 2000}, + ), + ( + {"version": "3.7", "downloads": 3000}, + {"version": "3.7", "downloads": 2000}, + ), + (None, {"version": "3.8", "downloads": 500}), + ], + ), ] for c in cases: with self.subTest(msg=c.msg, args=c.args, kwargs=c.kwargs): From daadc0824407bf7877d9bc755e07d62b0ee0b820 Mon Sep 17 00:00:00 2001 From: podhmo Date: Fri, 31 May 2019 22:37:45 +0900 Subject: [PATCH 4/4] tests: multi keys tests --- dictknife/tests/test_query.py | 103 ++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/dictknife/tests/test_query.py b/dictknife/tests/test_query.py index 94333c23..c11210a0 100644 --- a/dictknife/tests/test_query.py +++ b/dictknife/tests/test_query.py @@ -128,6 +128,109 @@ class copied: self.assertTrue(data.x_packages == copied.x_packages, "not modified") self.assertTrue(data.y_packages == copied.y_packages, "not modified") + def test_multi_keys(self): + class data: + classes = [ + {"id": 1, "year": "1", "name": "A"}, + {"id": 2, "year": "1", "name": "B"}, + {"id": 3, "year": "1", "name": "C"}, + {"id": 4, "year": "2", "name": "A"}, + {"id": 5, "year": "2", "name": "B"}, + ] + students = [ + {"id": 1, "year": "1", "class": "A", "cid": 1, "name": "foo"}, + {"id": 2, "year": "1", "class": "A", "cid": 1, "name": "bar"}, + {"id": 3, "year": "1", "class": "B", "cid": 2, "name": "boo"}, + {"id": 4, "year": "1", "class": "C", "cid": 3, "name": "yoo"}, + ] + + class copied: + classes = copy.deepcopy(data.classes) + students = copy.deepcopy(data.students) + + C = namedtuple("C", "msg, args, kwargs, want") + cases = [ + C( + msg="inner join", + args=[data.classes, data.students], + kwargs={"left_on": "id", "right_on": "cid"}, + want=[ + ( + {"id": 1, "year": "1", "name": "A"}, + {"id": 1, "year": "1", "class": "A", "cid": 1, "name": "foo"}, + ), + ( + {"id": 1, "year": "1", "name": "A"}, + {"id": 2, "year": "1", "class": "A", "cid": 1, "name": "bar"}, + ), + ( + {"id": 2, "year": "1", "name": "B"}, + {"id": 3, "year": "1", "class": "B", "cid": 2, "name": "boo"}, + ), + ( + {"id": 3, "year": "1", "name": "C"}, + {"id": 4, "year": "1", "class": "C", "cid": 3, "name": "yoo"}, + ), + ], + ), + C( + msg="inner join with multi keys", + args=[data.classes, data.students], + kwargs={"left_on": ("year", "name"), "right_on": ("year", "class")}, + want=[ + ( + {"id": 1, "year": "1", "name": "A"}, + {"id": 1, "year": "1", "class": "A", "cid": 1, "name": "foo"}, + ), + ( + {"id": 1, "year": "1", "name": "A"}, + {"id": 2, "year": "1", "class": "A", "cid": 1, "name": "bar"}, + ), + ( + {"id": 2, "year": "1", "name": "B"}, + {"id": 3, "year": "1", "class": "B", "cid": 2, "name": "boo"}, + ), + ( + {"id": 3, "year": "1", "name": "C"}, + {"id": 4, "year": "1", "class": "C", "cid": 3, "name": "yoo"}, + ), + ], + ), + C( + msg="inner join with multi keys2", + args=[data.students, data.classes], + kwargs={"left_on": ("year", "class"), "right_on": ("year", "name")}, + want=[ + ( + {"id": 1, "year": "1", "class": "A", "cid": 1, "name": "foo"}, + {"id": 1, "year": "1", "name": "A"}, + ), + ( + {"id": 2, "year": "1", "class": "A", "cid": 1, "name": "bar"}, + {"id": 1, "year": "1", "name": "A"}, + ), + ( + {"id": 3, "year": "1", "class": "B", "cid": 2, "name": "boo"}, + {"id": 2, "year": "1", "name": "B"}, + ), + ( + {"id": 4, "year": "1", "class": "C", "cid": 3, "name": "yoo"}, + {"id": 3, "year": "1", "name": "C"}, + ), + ], + ), + ] + for c in cases: + with self.subTest(msg=c.msg, kwargs=c.kwargs): + got = self._callFUT(*c.args, **c.kwargs) + + self.assertTrue( + got == c.want, msg=_DifferenceReportText(got=got, want=c.want) + ) + + self.assertTrue(data.students == copied.students, "not modified") + self.assertTrue(data.classes == copied.classes, "not modified") + class _DifferenceReportText: def __init__(self, *, got, want):