-
Notifications
You must be signed in to change notification settings - Fork 2
Checks abbreviations press and reports #759
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: dev
Are you sure you want to change the base?
Changes from all commits
7316257
c89d506
8394b5a
975b4d2
280d9cd
7fe11be
edd07ce
d696932
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,80 @@ | ||
| import re | ||
| from pymorphy2 import MorphAnalyzer | ||
| morph = MorphAnalyzer() | ||
|
|
||
|
|
||
def get_unexplained_abbrev(text):
    """Collect the abbreviations in *text* that have no accepted expansion.

    Returns a ``(found, unexplained)`` pair:

    * ``(False, None)`` when ``find_abbreviations`` reports nothing;
    * ``(True, items)`` otherwise, where ``items`` lists every abbreviation
      for which ``is_abbreviation_explained`` returned a falsy value (the
      list is empty when all of them are explained).
    """
    candidates = find_abbreviations(text)
    if not candidates:
        return False, None

    missing = [
        candidate
        for candidate in candidates
        if not is_abbreviation_explained(candidate, text)
    ]
    return True, missing
|
|
||
|
|
||
|
|
||
|
|
||
# Abbreviations treated as common knowledge: they never require an expansion.
# NOTE: candidates are runs of 2-5 uppercase letters, so mixed-case entries
# ('PCIe', 'NoSQL') and entries longer than five letters ('UNICODE',
# 'EEPROM') can never equal a candidate; they are kept for reference only.
_COMMON_ABBREVIATIONS = frozenset({
    'СССР', 'РФ', 'США', 'ВКР', 'ИТ', 'ПО', 'ООО', 'ЗАО', 'ОАО', 'HTML', 'CSS',
    'JS', 'ЛЭТИ', 'МОЕВМ', 'ЭВМ', 'ГОСТ', 'DVD',
    'SSD', 'PC', 'HDD',
    'AX', 'BX', 'CX', 'DX', 'SI', 'DI', 'BP', 'SP',
    'AH', 'AL', 'BH', 'BL', 'CH', 'CL', 'DH', 'DL',
    'CS', 'DS', 'ES', 'SS', 'FS', 'GS',
    'IP', 'EIP', 'RIP',
    'CF', 'PF', 'AF', 'ZF', 'SF', 'TF', 'IF', 'DF', 'OF',
    'EAX', 'EBX', 'ECX', 'EDX', 'ESI', 'EDI', 'EBP', 'ESP',
    'RAX', 'RBX', 'RCX', 'RDX', 'RSI', 'RDI', 'RBP', 'RSP',
    'DOS', 'OS', 'BIOS', 'UEFI', 'MBR', 'GPT',
    'ASCII', 'UTF', 'UNICODE', 'ANSI',
    'МОЭВМ',
    'CPU', 'GPU', 'APU', 'RAM', 'ROM', 'PROM', 'EPROM', 'EEPROM',
    'USB', 'SATA', 'PCI', 'PCIe', 'AGP', 'ISA', 'VGA', 'HDMI', 'DP',
    'LAN', 'WAN', 'WLAN', 'VPN', 'ISP', 'DNS', 'DHCP', 'TCP', 'UDP',
    'HTTP', 'HTTPS', 'FTP', 'SSH', 'SSL', 'TLS',
    'API', 'GUI', 'CLI', 'IDE', 'SDK', 'SQL', 'NoSQL', 'XML', 'JSON', 'YAML',
    'MAC', 'IBM',
})

# A candidate is a standalone run of 2-5 uppercase Cyrillic or Latin letters.
_ABBREVIATION_PATTERN = re.compile(r'\b[А-ЯA-Z]{2,5}\b')


def find_abbreviations(text: str):
    """Return the distinct abbreviation candidates found in *text*.

    Candidates are standalone runs of 2-5 uppercase letters. Entries of the
    common-abbreviation whitelist are dropped, as is every token whose first
    morphological parse has a score of exactly 0.

    The previous set literal was missing a comma after ``'DVD'``, so Python
    concatenated it with the next literal into ``'DVDSSD'`` and neither
    ``'DVD'`` nor ``'SSD'`` was actually whitelisted.

    Args:
        text: The text to scan.

    Returns:
        A list of unique abbreviations, sorted so the result is deterministic
        (previously the order was whatever ``list(set(...))`` produced).
    """
    # Deduplicate first so the morphological analyzer runs once per token.
    candidates = set(_ABBREVIATION_PATTERN.findall(text)) - _COMMON_ABBREVIATIONS
    return sorted(
        abbr
        for abbr in candidates
        # NOTE(review): presumably a zero score marks tokens the analyzer
        # does not consider real words -- confirm the intent of this filter.
        if morph.parse(abbr.lower())[0].score != 0
    )
|
Comment on lines
+48
to
+50
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Можно сразу формировать set в конструкции вида `{abbr for abbr in abbreviations if ...}` |
||
|
|
||
|
|
||
def is_abbreviation_explained(abbr: str, text: str) -> bool:
    """Tell whether *text* contains an accepted expansion of *abbr*.

    The abbreviation is searched for (case-insensitively, as a whole word)
    in four layouts::

        ABBR (expansion)        (expansion) ABBR
        ABBR — expansion        expansion — ABBR

    where the dash may be an em dash, an en dash or a hyphen. Every
    occurrence of every layout is examined, and the captured text is handed
    to ``correctly_explained``; the first accepted capture wins.

    Compared with the earlier version:

    * the abbreviation is escaped and wrapped in ``\\b`` so it can no longer
      match in the middle of a longer word;
    * all occurrences are checked instead of only the first one per layout,
      so an expansion that follows a later mention is not missed;
    * the separate hyphen-only layouts were removed because the dash
      character class already contains the hyphen.

    Args:
        abbr: The abbreviation to look for.
        text: The full text to search.

    Returns:
        True if at least one captured expansion is accepted, else False.
    """
    token = rf'\b{re.escape(abbr)}\b'
    dash = r'[—–\-]'
    patterns = (
        rf'{token}\s*\(([^)]+)\)',             # ABBR (expansion)
        rf'\(([^)]+)\)\s*{token}',             # (expansion) ABBR
        rf'{token}\s*{dash}\s*([^.,;!?]+)',    # ABBR — expansion
        rf'([^.,;!?]+)\s*{dash}\s*{token}',    # expansion — ABBR
    )
    return any(
        correctly_explained(abbr, match.group(1))
        for pattern in patterns
        for match in re.finditer(pattern, text, re.IGNORECASE)
    )
|
|
||
def correctly_explained(abbr, explan):
    """Check that the word initials of *explan* spell out *abbr*.

    The explanation is split into runs of letters (so whitespace, hyphens
    and punctuation all separate words), the first letter of each run is
    collected, and the abbreviation must occur as a contiguous run inside
    that string of initials. Containment rather than equality is used
    because the surrounding regexes capture up to the sentence boundary, so
    the expansion is usually accompanied by extra words.

    The earlier implementation compared the initials with the single
    character ``abbr[len(initials)]``; that raised ``IndexError`` as soon as
    the explanation had at least as many words as the abbreviation has
    letters, and otherwise could only be true for a one-word explanation.

    Args:
        abbr: The abbreviation, e.g. ``"МВД"``.
        explan: Candidate expansion text.

    Returns:
        True if the initials contain the abbreviation, otherwise False.
        An empty abbreviation is never considered explained.
    """
    if not abbr:
        return False

    # [^\W\d_]+ matches runs of Unicode letters only.
    initials = "".join(
        word[0] for word in re.findall(r"[^\W\d_]+", explan)
    ).upper()
    return abbr.upper() in initials
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,59 @@ | ||
| import re | ||
| from ..base_check import BasePresCriterion, answer | ||
| from ..check_abbreviations import get_unexplained_abbrev | ||
|
|
||
|
|
||
class PresAbbreviationsCheck(BasePresCriterion):
    """Presentation criterion: abbreviations must be expanded on first use."""

    # NOTE(review): a reviewer asked that these attributes follow the
    # project's naming template for criteria.
    label = "Проверка расшифровки аббревиатур в презентации"
    description = "Все аббревиатуры должны быть расшифрованы при первом использовании"
    id = 'abbreviations_check_pres'

    def __init__(self, file_info):
        super().__init__(file_info)

    def check(self):
        """Run the criterion over the text of all slides.

        Returns the value of ``answer(...)``: a passing answer when no
        abbreviations are found or all are explained, otherwise a failing
        answer listing each unexplained abbreviation with the first slide it
        appears on. Any exception is reported as a failing answer.
        """
        try:
            slides_text = self.file.get_text_from_slides()
            if not slides_text:
                return answer(False, "Не удалось получить текст презентации")

            abbr_is_finding, unexplained_abbr = get_unexplained_abbrev(
                text=" ".join(slides_text)
            )
            if not abbr_is_finding:
                return answer(True, "Аббревиатуры не найдены в презентации")
            if not unexplained_abbr:
                return answer(True, "Все аббревиатуры правильно расшифрованы")

            # Whole-word, case-sensitive matching: a plain substring test
            # could attribute an abbreviation to a slide where it only
            # occurs inside a longer word.
            patterns = {
                abbr: re.compile(rf'\b{re.escape(abbr)}\b')
                for abbr in unexplained_abbr
            }
            first_slide = {}
            for slide_num, slide_text in enumerate(slides_text, 1):
                for abbr, pattern in patterns.items():
                    if abbr not in first_slide and pattern.search(slide_text):
                        first_slide[abbr] = slide_num

            slide_links = self.format_page_link(list(first_slide.values()))
            parts = ["Найдены нерасшифрованные аббревиатуры при первом использовании:<br>"]
            for index, abbr in enumerate(first_slide):
                parts.append(f"- {abbr} на слайде {slide_links[index]}<br>")
            parts.append(
                "<br>Каждая аббревиатура должна быть расшифрована при первом использовании в презентации.<br>"
            )
            # Grammar of the hint fixed ("должны ... первыми" -> "должна ... первым").
            parts.append(
                "Расшифровка должна быть по первым буквам, например, МВД - Министерство внутренних дел.<br>"
            )
            return answer(False, "".join(parts))

        except Exception as e:
            return answer(False, f"Ошибка при проверке аббревиатур: {e}")

    def _find_abbreviation_slides(self, abbr: str, slides_text: list) -> list:
        """Return the 1-based numbers of all slides mentioning *abbr*.

        Matching is whole-word and case-insensitive.
        """
        pattern = re.compile(rf'\b{re.escape(abbr)}\b', re.IGNORECASE)
        return [
            slide_num
            for slide_num, slide_text in enumerate(slides_text, 1)
            if pattern.search(slide_text)
        ]
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,65 @@ | ||
| from ..base_check import BaseReportCriterion, answer | ||
| from ..check_abbreviations import get_unexplained_abbrev | ||
|
|
||
class AbbreviationsCheckPres(BaseReportCriterion):
    """Report criterion: abbreviations must be expanded on first use.

    NOTE(review): despite the ``Pres`` suffix this class derives from
    ``BaseReportCriterion``; the name is left unchanged because other
    modules may import it.
    """

    label = "Проверка расшифровки аббревиатур"
    description = "Все аббревиатуры должны быть расшифрованы при первом использовании"
    id = 'abbreviations_check'

    def __init__(self, file_info):
        super().__init__(file_info)

    def check(self):
        """Run the criterion over the document text.

        Returns the value of ``answer(...)``: a passing answer when no
        abbreviations are found or all are explained, otherwise a failing
        answer listing each unexplained abbreviation with the first page it
        appears on. Any exception is reported as a failing answer.
        """
        # Local import: ``re`` is not among this module's visible imports.
        import re

        # NOTE(review): a reviewer pointed out that this logic duplicates the
        # presentation criterion almost verbatim (only the wording and the
        # way the text is obtained differ) and should be moved into a shared
        # function/module.
        try:
            text = self._get_document_text()
            if not text:
                return answer(False, "Не удалось получить текст документа")

            abbr_is_finding, unexplained_abbr = get_unexplained_abbrev(text=text)
            if not abbr_is_finding:
                return answer(True, "Аббревиатуры не найдены в документе")
            if not unexplained_abbr:
                return answer(True, "Все аббревиатуры правильно расшифрованы")

            # Whole-word, case-sensitive matching: a plain substring test
            # could attribute an abbreviation to a page where it only occurs
            # inside a longer word.
            patterns = {
                abbr: re.compile(rf'\b{re.escape(abbr)}\b')
                for abbr in unexplained_abbr
            }
            first_page = {}
            for page_num in range(1, self.file.page_counter() + 1):
                # NOTE(review): this reads ``pdf_file`` unconditionally, while
                # _get_document_text also supports files that only expose
                # ``paragraphs`` -- confirm such files cannot reach this point.
                text_on_page = self.file.pdf_file.text_on_page[page_num]
                for abbr, pattern in patterns.items():
                    if abbr not in first_page and pattern.search(text_on_page):
                        first_page[abbr] = page_num

            page_links = self.format_page_link(list(first_page.values()))
            parts = ["Найдены нерасшифрованные аббревиатуры при первом использовании:<br>"]
            for index, abbr in enumerate(first_page):
                parts.append(f"- {abbr} на странице {page_links[index]}<br>")
            parts.append(
                "Каждая аббревиатура должна быть расшифрована при первом использовании в тексте.<br>"
            )
            # Grammar of the hint fixed ("должны ... первыми" -> "должна ... первым").
            parts.append(
                "Расшифровка должна быть по первым буквам, например, МВД - Министерство внутренних дел.<br>"
            )
            return answer(False, "".join(parts))

        except Exception as e:
            return answer(False, f"Ошибка при проверке аббревиатур: {e}")

    def _get_document_text(self):
        """Return the document text as one string, or None if unavailable.

        Uses the per-page PDF text when the file object has ``pdf_file``,
        otherwise the paragraphs when it has ``paragraphs``.
        """
        if hasattr(self.file, 'pdf_file'):
            page_texts = self.file.pdf_file.get_text_on_page()
            return " ".join(page_texts.values())

        if hasattr(self.file, 'paragraphs'):
            text_parts = []
            for paragraph in self.file.paragraphs:
                text = paragraph.to_string()
                # Only the second line is kept when the string is multi-line;
                # presumably the first line is a style/header prefix -- TODO confirm.
                if '\n' in text:
                    text = text.split('\n')[1]
                text_parts.append(text)
            return "\n".join(text_parts)

        return None
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Подтяните изменения из dev и перейдите на pymorphy3