cambridge-cli/cambridge.py at master · Zaphoood/cambridge-cli · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#!/usr/bin/env python3
from typing import Iterable, List, Optional, Union, cast
import requests
import sys
from bs4 import BeautifulSoup, Tag
from dataclasses import dataclass
import logging
from format import roman, prepend, prepend_first_line, wrap

# Root URL of the English dictionary; the looked-up word is appended to it.
BASE_URL = "https://dictionary.cambridge.org/dictionary/english/"

# HTTP headers sent with every request. A browser-like User-Agent is supplied
# (presumably because the site rejects the default client UA -- TODO confirm).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/119.0",
}


class WordNotFound(ValueError):
    """Raised when the dictionary has no entry for the requested word."""


@dataclass
class WordDefinition:
    """A single sense of a word: an optional guideword plus its explanation."""

    guideword: Optional[str]
    explanation: str

    def __str__(self) -> str:
        """Render as ``"<guideword>: <wrapped explanation>"``.

        The ``"<guideword>: "`` prefix is omitted when there is no guideword.
        """
        prefix = ""
        if self.guideword is not None:
            prefix = self.guideword + ": "
        return prefix + wrap(self.explanation)


@dataclass
class WordInfo:
    """One dictionary entry: headword, part of speech, pronunciations, senses.

    ``str()`` renders the entry as a multi-line, tab-indented block:
    the headword (with the part of speech in parentheses, if any), then a
    pronunciation line, then the definitions.
    """

    word: str
    pos: Optional[str]
    # Quoted forward reference: not evaluated when the class is created.
    definitions: List["WordDefinition"]
    pronunciation_uk: Optional[str]
    pronunciation_us: Optional[str]

    def __str__(self) -> str:
        """Return the human-readable, multi-line rendering of this entry."""
        out = self.word
        if self.pos is not None:
            out += f" ({self.pos})"

        # Collect whichever pronunciations are available. The separator is
        # emitted when at least one exists, so a lone UK or US pronunciation
        # is not glued directly onto the headword.
        pronunciations = []
        if self.pronunciation_uk:
            pronunciations.append(f"UK: /{self.pronunciation_uk}/")
        if self.pronunciation_us:
            pronunciations.append(f"US: /{self.pronunciation_us}/")
        if pronunciations:
            out += "\n\n\t" + ", ".join(pronunciations)

        if len(self.definitions) > 0:
            out += "\n"
        if len(self.definitions) == 1:
            # A single definition is shown without a numeral.
            out += "\n" + prepend("\t", str(self.definitions[0]))
        elif len(self.definitions) > 1:
            for i, definition in enumerate(self.definitions):
                # Label each definition with a lowercased, right-aligned
                # numeral (presumably roman, per the helper's name).
                definition_with_numeral = prepend_first_line(
                    roman(i + 1).lower().rjust(5) + ") ", str(definition)
                )
                out += "\n" + prepend("\t", definition_with_numeral)

        return out


def get_page_for_word(word: str, timeout: float = 10.0) -> str:
    """Download the dictionary page for ``word`` and return its HTML.

    Redirects are not followed: a 302 response whose ``Location`` equals
    ``BASE_URL`` is treated as "word not found". Any other non-200 response
    is reported as an error; if ``word`` contains uppercase characters the
    lookup is retried once with the lowercased word.

    Args:
        word: The word to look up; must be non-empty.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text.

    Raises:
        ValueError: If ``word`` is empty.
        WordNotFound: If the site redirects the request to ``BASE_URL``.
        SystemExit: If the request fails or returns a non-200 status and no
            lowercase retry is possible.
    """
    if not word:
        # Explicit check rather than ``assert`` so it survives ``python -O``.
        raise ValueError("word must not be empty")
    url = BASE_URL + word

    logging.info(f"Getting response from: {url}")

    try:
        # Without a timeout ``requests`` may block indefinitely.
        response = requests.get(
            url, headers=HEADERS, allow_redirects=False, timeout=timeout
        )
    except requests.RequestException as e:
        print(f"ERROR: Could not get response: {e}")
        sys.exit(1)

    if response.status_code == 302 and response.headers.get("Location") == BASE_URL:
        raise WordNotFound(f"Could not find word '{word}'")
    if response.status_code != 200:
        print("ERROR: Could not get response")
        if word != word.lower():
            print(f"Trying '{word.lower()}' (lowercase)")
            return get_page_for_word(word.lower(), timeout=timeout)
        sys.exit(1)

    return response.text


def parse_info(page_src: str) -> Iterable[WordInfo]:
    """Extract every word entry from a dictionary page.

    Args:
        page_src: HTML source of the dictionary page.

    Returns:
        The ``WordInfo`` objects for all entries that could be parsed, in
        page order. Empty if the page has no ``div.pr.dictionary`` element.
    """
    logging.info("Parsing page...")
    soup = BeautifulSoup(page_src, "html.parser")

    dictionary = select_first(soup, "div.pr.dictionary")
    if dictionary is None:
        logging.fatal("Dictionary element not found")
        return []

    entries = dictionary.select("div.pr.entry-body__el")

    # Entries without a headword yield None and are dropped.
    parsed = (get_word_info_for_entry(entry) for entry in entries)
    return [info for info in parsed if info is not None]


def get_word_info_for_entry(entry: Union[BeautifulSoup, Tag]) -> Optional[WordInfo]:
    """Build a ``WordInfo`` from one entry element of the dictionary page.

    Returns ``None`` when the entry has no headword element
    (``span.hw.dhw``). Pronunciations and part of speech are optional and
    left as ``None`` when absent. Senses without a definition element are
    skipped.
    """
    headword = select_first(entry, "span.hw.dhw")
    if headword is None:
        return None

    uk_elem = select_first(entry, "span.uk.dpron-i span.ipa")
    uk_ipa = None if uk_elem is None else uk_elem.text

    us_elem = select_first(entry, "span.us.dpron-i span.ipa")
    us_ipa = None if us_elem is None else us_elem.text

    pos_elem = select_first(entry, "span.pos.dpos")
    part_of_speech = None if pos_elem is None else pos_elem.text

    senses = []
    for sense in entry.select("div.pr.dsense"):
        guideword_elem = select_first(sense, "span.guideword.dsense_gw span")
        definition_elem = select_first(sense, "div.def.ddef_d.db")
        if definition_elem is not None:
            guideword_text = None if guideword_elem is None else guideword_elem.text
            senses.append(
                WordDefinition(guideword_text, definition_get_inner(definition_elem))
            )

    return WordInfo(
        word=headword.text,
        pos=part_of_speech,
        definitions=senses,
        pronunciation_uk=uk_ipa,
        pronunciation_us=us_ipa,
    )


def definition_get_inner(definition: Tag) -> str:
    """Return the element's text, stripped, without a single trailing colon."""
    text = "".join(definition.strings).strip()
    return text[:-1] if text.endswith(":") else text


def select_first(soup: Union[BeautifulSoup, Tag], selector: str) -> Optional[Tag]:
    """Return the first element matching the CSS ``selector``, or ``None``.

    Logs a warning when nothing matches and when more than one element
    matches (the first match is used in the latter case).
    """
    found = soup.select(selector)
    if not found:
        logging.warning(f"No matches found for selector '{selector}'")
        return None

    first, *others = found
    if others:
        logging.warning(
            f"Multiple matches found for selector '{selector}'; will use first match"
        )

    return first


def main():
    """CLI entry point: look up the word given as first argument and print it.

    Exits with status 1 on missing argument, unknown word, or when no entry
    could be parsed from the page.
    """
    logging.basicConfig(level=logging.ERROR)

    argv = sys.argv
    if len(argv) < 2:
        print(f"USAGE: {argv[0]} WORD")
        sys.exit(1)

    try:
        html = get_page_for_word(argv[1])
    except WordNotFound as err:
        print(f"ERROR: {err}")
        sys.exit(1)

    results = list(parse_info(html))
    if not results:
        print("Couldn't get info for word")
        sys.exit(1)

    if len(results) == 1:
        print(results[0])
        return

    # Several entries: number them, with a blank line after each.
    for number, info in enumerate(results, start=1):
        print(f"{number}. {info}\n")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()