From ae1111343d598791c29f3dae781e499f01d41828 Mon Sep 17 00:00:00 2001 From: Arvin Rolos Date: Wed, 7 Jan 2026 14:59:41 -0800 Subject: [PATCH 1/8] feat: Add SCORM to DOCX --- api/apps.py | 13 + api/scorm/XmlReader.py | 1451 ++++++++++++++++++++++++++++++++++++++++ api/serializers.py | 60 +- api/urls.py | 2 + api/views.py | 203 +++++- requirements.txt | 2 +- 6 files changed, 1721 insertions(+), 10 deletions(-) create mode 100644 api/scorm/XmlReader.py diff --git a/api/apps.py b/api/apps.py index 2a93f58..48d213e 100644 --- a/api/apps.py +++ b/api/apps.py @@ -1,5 +1,6 @@ from django.apps import AppConfig from django.conf import settings +from django.db import connection import sys import logging logger = logging.getLogger(__name__) @@ -18,6 +19,18 @@ def ready(self): else: logger.info("qconapi has started") + # Ensure database connection is ready before accessing the database + # This prevents the RuntimeWarning about accessing database during app initialization + try: + connection.ensure_connection() + except Exception: + # Database not ready yet, skip initialization + return + + # Skip database operations during migrations + if 'migrate' in sys.argv or 'makemigrations' in sys.argv: + return + from django.contrib.auth.models import User if not User.objects.filter(username=settings.ADMIN_USERNAME).exists(): User.objects.create_superuser( diff --git a/api/scorm/XmlReader.py b/api/scorm/XmlReader.py new file mode 100644 index 0000000..79bb591 --- /dev/null +++ b/api/scorm/XmlReader.py @@ -0,0 +1,1451 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
def _extract_zip(self):
    """Extract the SCORM ZIP archive into the extraction directory.

    When ``self.extract_to_path`` is ``None`` the destination defaults to
    ``<settings.MEDIA_ROOT>/scorm_extract_<zip file name without extension>``.
    The directory (including missing parents) is created when needed, the
    archive is unpacked into it, and ``self.extracted_path`` is set to the
    directory that was used.

    Raises:
        FileNotFoundError: If ``self.scorm_zip_path`` does not exist.
    """
    if not path.exists(self.scorm_zip_path):
        raise FileNotFoundError(f"SCORM ZIP file not found: {self.scorm_zip_path}")

    if self.extract_to_path is None:
        # Derive a per-archive directory name from the ZIP file name.
        zip_basename = path.splitext(path.basename(self.scorm_zip_path))[0]
        self.extract_to_path = path.join(settings.MEDIA_ROOT, f"scorm_extract_{zip_basename}")

    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the same directory.
    makedirs(self.extract_to_path, exist_ok=True)

    # NOTE(review): files left over from an earlier extraction into the same
    # directory are not removed here - confirm whether that matters to callers.
    with ZipFile(self.scorm_zip_path, 'r') as zip_ref:
        zip_ref.extractall(self.extract_to_path)

    self.extracted_path = self.extract_to_path
def _parse_section(self, section_el):
    """Parse a ``<section>`` element into a plain dictionary.

    Args:
        section_el: XML element representing a section.

    Returns:
        dict: Section data with the keys ``ident``, ``title``, ``shuffle``,
        ``is_title_displayed``, ``is_text_displayed``, ``text`` and
        ``questions``.  A ``sections`` key (list of nested section dicts,
        parsed recursively) is added only when the element has direct
        ``<section>`` children.
    """
    d2l_ns = '{http://desire2learn.com/xsd/d2lcp_v2p0}'

    section_data = {
        'ident': section_el.get('ident', ''),
        'title': section_el.get('title', ''),
        'shuffle': False,
        'is_title_displayed': True,
        'is_text_displayed': False,
        'text': '',
        'questions': []
    }

    # Shuffle is signalled by <selection_ordering><order order_type="Random">.
    selection_ordering = section_el.find('selection_ordering')
    if selection_ordering is not None:
        order_el = selection_ordering.find('order')
        if order_el is not None and order_el.get('order_type') == 'Random':
            section_data['shuffle'] = True

    # Section text lives in <presentation_material>.
    presentation_material = section_el.find('presentation_material')
    if presentation_material is not None:
        section_data['text'] = self._extract_text_from_material(presentation_material)

    # D2L-specific display flags.
    sectionproc = section_el.find('sectionproc_extension')
    if sectionproc is not None:
        display_name = sectionproc.find(d2l_ns + 'display_section_name')
        # An empty element has text None; keep the default instead of
        # crashing on None.lower().
        if display_name is not None and display_name.text is not None:
            section_data['is_title_displayed'] = display_name.text.lower() == 'yes'

        type_display = sectionproc.find(d2l_ns + 'type_display_section')
        if type_display is not None:
            section_data['is_text_displayed'] = type_display.text == '1'

    # Nested sections are parsed before items, as in the original order.
    nested_sections = [
        self._parse_section(nested_section_el)
        for nested_section_el in section_el.findall('section')
    ]
    if nested_sections:
        section_data['sections'] = nested_sections

    for item_el in section_el.findall('item'):
        section_data['questions'].append(self._parse_question(item_el))

    return section_data
def _extract_text_from_material(self, material_el):
    """Return the concatenated ``<mattext>`` content found under ``flow_mat``.

    Looks for a direct ``flow_mat`` child of ``material_el``, visits every
    ``material`` element below it, and joins the text of each one's direct
    ``mattext`` child.  The joined string is stripped of leading and
    trailing whitespace.  An empty string is returned when there is no
    ``flow_mat`` child or no text.

    Args:
        material_el: Element that wraps a ``flow_mat`` (for example
            ``presentation_material`` or ``hintmaterial``).

    Returns:
        str: The extracted text.
    """
    flow_mat = material_el.find('flow_mat')
    if flow_mat is None:
        return ''

    text_parts = []
    for material in flow_mat.findall('.//material'):
        mattext = material.find('mattext')
        # ElementTree exposes CDATA content through .text.  .tail is the
        # text after </mattext> (indentation in pretty-printed XML), so it
        # must not be appended to the content.
        if mattext is not None and mattext.text:
            text_parts.append(mattext.text)

    return ''.join(text_parts).strip()
+ """ + mc_data = { + 'randomize': False, + 'enumeration': 4, + 'answers': [] + } + + presentation = item_el.find('presentation') + if presentation is None: + return mc_data + + flow = presentation.find('flow') + if flow is None: + return mc_data + + # Parse response_extension for enumeration + response_ext = flow.find('response_extension') + if response_ext is not None: + enumeration_el = response_ext.find('{http://desire2learn.com/xsd/d2lcp_v2p0}enumeration') + if enumeration_el is not None and enumeration_el.text: + try: + mc_data['enumeration'] = int(enumeration_el.text) + except (ValueError, TypeError): + pass + + # Parse response_lid for answers + response_lid = flow.find('response_lid') + if response_lid is not None: + # Check shuffle setting + render_choice = response_lid.find('render_choice') + if render_choice is not None: + mc_data['randomize'] = render_choice.get('shuffle', 'no').lower() == 'yes' + + # Parse answer options + question_lid = response_lid.get('ident', '') + answer_index = 1 + for flow_label in response_lid.findall('.//flow_label'): + response_label = flow_label.find('response_label') + if response_label is not None: + answer_ident = response_label.get('ident', '') + # Extract answer text + mattext = response_label.find('.//mattext') + answer_text = '' + if mattext is not None: + answer_text = mattext.text if mattext.text else '' + + # Find weight from resprocessing + weight = 0.0 + answer_feedback = None + resprocessing = item_el.find('resprocessing') + if resprocessing is not None: + for respcondition in resprocessing.findall('respcondition'): + conditionvar = respcondition.find('conditionvar') + if conditionvar is not None: + varequal = conditionvar.find('varequal') + if varequal is not None and varequal.get('respident') == question_lid: + if varequal.text == answer_ident: + setvar = respcondition.find('setvar') + if setvar is not None: + try: + weight = float(setvar.text) + except (ValueError, TypeError): + pass + + # Find 
def _parse_true_false(self, item_el, question_ident):
    """Parse the True/False specific data of a question ``<item>``.

    The first ``response_label`` under ``render_choice`` is treated as the
    "True" option and the second as the "False" option.  Weights and
    feedback are read from the ``respcondition`` entries in
    ``resprocessing`` whose ``varequal`` refers to the item's
    ``response_lid``.

    Args:
        item_el: XML element representing the question item.
        question_ident: Label of the question (not used by this parser;
            kept so all ``_parse_*`` methods share a signature).

    Returns:
        dict: ``true_weight``, ``true_feedback``, ``false_weight``,
        ``false_feedback`` and ``enumeration``.  Defaults are returned for
        anything that is missing from the XML.
    """
    tf_data = {
        'true_weight': 0.0,
        'true_feedback': None,
        'false_weight': 0.0,
        'false_feedback': None,
        'enumeration': 4
    }

    presentation = item_el.find('presentation')
    if presentation is None:
        return tf_data

    flow = presentation.find('flow')
    if flow is None:
        return tf_data

    response_ext = flow.find('response_extension')
    if response_ext is not None:
        enumeration_el = response_ext.find('{http://desire2learn.com/xsd/d2lcp_v2p0}enumeration')
        if enumeration_el is not None and enumeration_el.text:
            try:
                tf_data['enumeration'] = int(enumeration_el.text)
            except (ValueError, TypeError):
                pass  # keep the default enumeration on malformed input

    response_lid = flow.find('response_lid')
    if response_lid is None:
        return tf_data
    question_lid = response_lid.get('ident', '')

    true_ident = None
    false_ident = None
    render_choice = response_lid.find('render_choice')
    if render_choice is not None:
        response_labels = render_choice.findall('.//response_label')
        if len(response_labels) >= 1:
            true_ident = response_labels[0].get('ident', '')
        if len(response_labels) >= 2:
            false_ident = response_labels[1].get('ident', '')

    resprocessing = item_el.find('resprocessing')
    if resprocessing is None:
        return tf_data

    # Index feedback once by ident (first occurrence wins, like find()).
    # A dict lookup also works for idents containing quotes, which an
    # interpolated "[@ident='...']" predicate cannot express.
    feedback_by_ident = {}
    for feedback_el in item_el.iter('itemfeedback'):
        if feedback_el is not item_el:
            feedback_by_ident.setdefault(feedback_el.get('ident'), feedback_el)

    for respcondition in resprocessing.findall('respcondition'):
        conditionvar = respcondition.find('conditionvar')
        if conditionvar is None:
            continue
        varequal = conditionvar.find('varequal')
        if varequal is None or varequal.get('respident') != question_lid:
            continue

        # Decide which option this condition scores; True is checked first.
        answer_ident = varequal.text
        if true_ident and answer_ident == true_ident:
            prefix = 'true'
        elif false_ident and answer_ident == false_ident:
            prefix = 'false'
        else:
            continue

        setvar = respcondition.find('setvar')
        if setvar is not None:
            try:
                tf_data[prefix + '_weight'] = float(setvar.text)
            except (ValueError, TypeError):
                pass  # non-numeric weight: keep the previous value

        displayfeedback = respcondition.find('displayfeedback')
        if displayfeedback is not None:
            feedback_el = feedback_by_ident.get(displayfeedback.get('linkrefid', ''))
            if feedback_el is not None:
                tf_data[prefix + '_feedback'] = self._extract_text_from_feedback(feedback_el)

    return tf_data
def _parse_multi_select(self, item_el, question_ident):
    """Parse the multi-select specific data of a question ``<item>``.

    Args:
        item_el: XML element representing the question item.
        question_ident: Label of the question (not used by this parser;
            kept so all ``_parse_*`` methods share a signature).

    Returns:
        dict: ``randomize``, ``enumeration``, ``style``, ``grading_type``
        and ``answers``.  Each answer is a dict with ``answer``,
        ``is_correct``, ``answer_feedback`` and a 1-based ``order``.
        ``style`` is never read from the XML and stays at its default.
    """
    d2l_ns = '{http://desire2learn.com/xsd/d2lcp_v2p0}'

    ms_data = {
        'randomize': False,
        'enumeration': 4,
        'style': 2,
        'grading_type': 2,
        'answers': []
    }

    presentation = item_el.find('presentation')
    if presentation is None:
        return ms_data

    flow = presentation.find('flow')
    if flow is None:
        return ms_data

    # Integer settings stored in the D2L response_extension.
    response_ext = flow.find('response_extension')
    if response_ext is not None:
        for key in ('enumeration', 'grading_type'):
            setting_el = response_ext.find(d2l_ns + key)
            if setting_el is not None and setting_el.text:
                try:
                    ms_data[key] = int(setting_el.text)
                except (ValueError, TypeError):
                    pass  # keep the default on malformed input

    response_lid = flow.find('response_lid')
    if response_lid is None:
        return ms_data
    question_lid = response_lid.get('ident', '')

    render_choice = response_lid.find('render_choice')
    if render_choice is not None:
        ms_data['randomize'] = render_choice.get('shuffle', 'no').lower() == 'yes'

    # Scan resprocessing once and group the conditions by the answer ident
    # they test, instead of re-scanning every condition for every answer.
    conditions_by_ident = {}
    resprocessing = item_el.find('resprocessing')
    if resprocessing is not None:
        for respcondition in resprocessing.findall('respcondition'):
            conditionvar = respcondition.find('conditionvar')
            if conditionvar is None:
                continue
            varequal = conditionvar.find('varequal')
            if varequal is None or varequal.get('respident') != question_lid:
                continue
            conditions_by_ident.setdefault(varequal.text, []).append(respcondition)

    # Index feedback once by ident (first occurrence wins, like find()).
    # A dict lookup also works for idents containing quotes, which an
    # interpolated "[@ident='...']" predicate cannot express.
    feedback_by_ident = {}
    for feedback_el in item_el.iter('itemfeedback'):
        if feedback_el is not item_el:
            feedback_by_ident.setdefault(feedback_el.get('ident'), feedback_el)

    for flow_label in response_lid.findall('.//flow_label'):
        response_label = flow_label.find('response_label')
        if response_label is None:
            continue
        answer_ident = response_label.get('ident', '')

        mattext = response_label.find('.//mattext')
        answer_text = ''
        if mattext is not None and mattext.text:
            answer_text = mattext.text

        is_correct = False
        answer_feedback = None
        for respcondition in conditions_by_ident.get(answer_ident, ()):
            setvar = respcondition.find('setvar')
            # A setvar that targets D2L_Correct marks the answer as correct.
            if setvar is not None and setvar.get('varname') == 'D2L_Correct':
                is_correct = True

            displayfeedback = respcondition.find('displayfeedback')
            if displayfeedback is not None:
                feedback_el = feedback_by_ident.get(displayfeedback.get('linkrefid', ''))
                if feedback_el is not None:
                    answer_feedback = self._extract_text_from_feedback(feedback_el)

        ms_data['answers'].append({
            'answer': answer_text,
            'is_correct': is_correct,
            'answer_feedback': answer_feedback,
            'order': len(ms_data['answers']) + 1
        })

    return ms_data
def _parse_ordering(self, item_el, question_ident):
    """Parse the ordering specific data of a question ``<item>``.

    Items are read from the ``response_grp`` whose ``rcardinality`` is
    ``"Ordered"``.  The feedback for the N-th item (1-based) is looked up
    in the ``itemfeedback`` element whose ident is
    ``<question_ident>_IF<N>``.

    Args:
        item_el: XML element representing the question item.
        question_ident: Label of the question, used to build the feedback
            idents.

    Returns:
        dict: ``{'items': [...]}`` where each item has ``text``, a 1-based
        ``order`` and ``ord_feedback`` (``None`` when there is none).
    """
    ord_data = {
        'items': []
    }

    presentation = item_el.find('presentation')
    if presentation is None:
        return ord_data

    flow = presentation.find('flow')
    if flow is None:
        return ord_data

    response_grp = flow.find('response_grp[@rcardinality="Ordered"]')
    if response_grp is None:
        return ord_data

    render_choice = response_grp.find('render_choice')
    if render_choice is None:
        return ord_data

    # Index feedback once by ident (first occurrence wins, like find()).
    # A dict lookup also works when question_ident contains a quote, which
    # an interpolated "[@ident='...']" predicate rejects with SyntaxError.
    feedback_by_ident = {}
    for feedback_el in item_el.iter('itemfeedback'):
        if feedback_el is not item_el:
            feedback_by_ident.setdefault(feedback_el.get('ident'), feedback_el)

    for flow_label in render_choice.findall('.//flow_label'):
        response_label = flow_label.find('response_label')
        if response_label is None:
            continue

        mattext = response_label.find('.//mattext')
        text = ''
        if mattext is not None and mattext.text:
            text = mattext.text

        # Only labels that produce an item advance the position counter.
        order_index = len(ord_data['items']) + 1

        ord_feedback = None
        feedback_el = feedback_by_ident.get(question_ident + "_IF" + str(order_index))
        if feedback_el is not None:
            ord_feedback = self._extract_text_from_feedback(feedback_el)

        ord_data['items'].append({
            'text': text,
            'order': order_index,
            'ord_feedback': ord_feedback
        })

    return ord_data
def populate_django_models(self, question_library=None):
    """Populate Django models from the parsed SCORM question database.

    The structure returned by ``self.parse_questiondb()`` is written out
    as follows:

    * The title of the first root section becomes ``main_title`` of the
      question library.
    * A root section is stored as a ``Section`` with
      ``is_main_content=True`` only when it has direct questions or
      non-blank text.
    * Every section nested directly under a root section is stored as a
      ``Section`` with ``is_main_content=False``.
    * ``Section.order`` starts at 1 and increases by one per created
      section; the question index starts at 1 and keeps counting across
      all sections.

    NOTE(review): sections nested more than one level below a root section
    are parsed by ``_parse_section`` but are not persisted here - confirm
    whether that is intended.

    Args:
        question_library: Optional existing QuestionLibrary instance to use.
            If None, a new one is created.  An existing instance gets its
            ``main_title`` overwritten and is saved.

    Returns:
        QuestionLibrary: The instance the sections and questions were
        attached to.
    """
    question_library_data = self.parse_questiondb()
    root_sections = question_library_data['sections']

    main_title = ''
    if root_sections:
        main_title = root_sections[0].get('title', '')

    if question_library is None:
        question_library = QuestionLibrary.objects.create(
            main_title=main_title,
            shuffle=False
        )
    else:
        question_library.main_title = main_title
        question_library.save()

    section_order = 1
    question_index = 1

    def create_section(data, is_main_content):
        # Create one Section plus its questions, advancing both counters.
        nonlocal section_order, question_index
        section = Section.objects.create(
            question_library=question_library,
            is_main_content=is_main_content,
            order=section_order,
            title=data.get('title', ''),
            is_title_displayed=data.get('is_title_displayed', True),
            text=data.get('text', ''),
            is_text_displayed=data.get('is_text_displayed', False),
            shuffle=data.get('shuffle', False)
        )
        section_order += 1
        for question_data in data.get('questions', []):
            self._create_question_model(section, question_data, question_index)
            question_index += 1

    for section_data in root_sections:
        has_direct_questions = len(section_data.get('questions', [])) > 0
        has_text = section_data.get('text', '').strip() != ''

        # A root section with neither questions nor text is a pure
        # container, so no Section row is created for it.
        if has_direct_questions or has_text:
            create_section(section_data, True)

        for nested_section_data in section_data.get('sections', []):
            create_section(nested_section_data, False)

    return question_library
questiontype=question_data.get('question_type_code', ''), + text=question_data.get('text', ''), + points=question_data.get('points', 1.0), + hint=question_data.get('hint'), + feedback=question_data.get('feedback') + ) + + question_type_code = question_data.get('question_type_code', '') + specific_data = question_data.get('question_specific_data', {}) + + if question_type_code == 'MC': + self._create_multiple_choice_model(question, specific_data) + elif question_type_code == 'TF': + self._create_true_false_model(question, specific_data) + elif question_type_code == 'FIB': + self._create_fib_model(question, specific_data) + elif question_type_code == 'MS': + self._create_multiple_select_model(question, specific_data) + elif question_type_code == 'MAT': + self._create_matching_model(question, specific_data) + elif question_type_code == 'ORD': + self._create_ordering_model(question, specific_data) + elif question_type_code == 'WR': + self._create_written_response_model(question, specific_data) + + return question + + def _create_multiple_choice_model(self, question, mc_data): + """Create MultipleChoice and MultipleChoiceAnswer models.""" + mc = MultipleChoice.objects.create( + question=question, + randomize=mc_data.get('randomize', False), + enumeration=mc_data.get('enumeration', 4) + ) + + for answer_data in mc_data.get('answers', []): + MultipleChoiceAnswer.objects.create( + multiple_choice=mc, + order=answer_data.get('order', 1), + answer=answer_data.get('answer', ''), + answer_feedback=answer_data.get('answer_feedback'), + weight=answer_data.get('weight', 0.0) + ) + + def _create_true_false_model(self, question, tf_data): + """Create TrueFalse model.""" + TrueFalse.objects.create( + question=question, + true_weight=tf_data.get('true_weight', 0.0), + true_feedback=tf_data.get('true_feedback'), + false_weight=tf_data.get('false_weight', 0.0), + false_feedback=tf_data.get('false_feedback'), + enumeration=tf_data.get('enumeration', 4) + ) + + def 
_create_fib_model(self, question, fib_data): + """Create Fib models for fill in the blanks.""" + for fib_item in fib_data.get('fibs', []): + Fib.objects.create( + question=question, + type=fib_item.get('type', 'fibquestion'), + text=fib_item.get('text', ''), + order=fib_item.get('order', 1), + size=fib_item.get('size') + ) + + def _create_multiple_select_model(self, question, ms_data): + """Create MultipleSelect and MultipleSelectAnswer models.""" + ms = MultipleSelect.objects.create( + question=question, + randomize=ms_data.get('randomize', False), + enumeration=ms_data.get('enumeration', 4), + style=ms_data.get('style', 2), + grading_type=ms_data.get('grading_type', 2) + ) + + for answer_data in ms_data.get('answers', []): + MultipleSelectAnswer.objects.create( + multiple_select=ms, + order=answer_data.get('order', 1), + answer=answer_data.get('answer', ''), + answer_feedback=answer_data.get('answer_feedback'), + is_correct=answer_data.get('is_correct', False) + ) + + def _create_matching_model(self, question, mat_data): + """Create Matching, MatchingChoice, and MatchingAnswer models.""" + matching = Matching.objects.create( + question=question, + grading_type=mat_data.get('grading_type', 0) + ) + + for choice_data in mat_data.get('choices', []): + matching_choice = MatchingChoice.objects.create( + matching=matching, + choice_text=choice_data.get('choice_text', '') + ) + + for answer_data in choice_data.get('matching_answers', []): + MatchingAnswer.objects.create( + matching_choice=matching_choice, + answer_text=answer_data.get('answer_text', '') + ) + + def _create_ordering_model(self, question, ord_data): + """Create Ordering models.""" + for item_data in ord_data.get('items', []): + Ordering.objects.create( + question=question, + text=item_data.get('text', ''), + order=item_data.get('order', 1), + ord_feedback=item_data.get('ord_feedback') + ) + + def _create_written_response_model(self, question, wr_data): + """Create WrittenResponse model.""" + 
WrittenResponse.objects.create( + question=question, + enable_student_editor=wr_data.get('enable_student_editor', False), + initial_text=wr_data.get('initial_text'), + answer_key=wr_data.get('answer_key', ''), + enable_attachments=wr_data.get('enable_attachments', False) + ) + + def format_to_markdown(self, question_library): + """ + Format parsed questions from Django models into markdown/text format + that matches the formatter_output structure (body text with questions). + This reconstructs the markdown that would have come from the original DOCX. + This can then be converted to DOCX using pandoc. + + Args: + question_library: QuestionLibrary Django model instance + + Returns: + str: Markdown formatted text (formatter_output format) ready for DOCX conversion + """ + lines = [] + + # Add main title as H1 heading if it exists + if question_library.main_title: + # Clean HTML from main title + main_title = question_library.main_title + try: + soup = BeautifulSoup(main_title, 'html.parser') + main_title = soup.get_text(separator=' ', strip=True) + except: + main_title = re.sub(r'\s+', ' ', main_title).strip() + lines.append(f"# {main_title}") + lines.append("") # Add blank line after title + + # Process sections + sections = question_library.get_sections() + for section in sections: + # Skip root section (is_main_content=True) - don't wrap it with #section markers + # Only wrap non-root sections with #section and /section markers + if not section.is_main_content: + lines.append("#section") + + # Add section title if present and should be displayed (## for markdown heading) + if section.title and section.is_title_displayed: + # Clean HTML from section title for display + section_title_display = section.title + try: + soup = BeautifulSoup(section_title_display, 'html.parser') + section_title_display = soup.get_text(separator=' ', strip=True) + except: + section_title_display = re.sub(r'\s+', ' ', section_title_display).strip() + lines.append(f"## 
{section_title_display}") + + # Add section text if present and should be displayed + if section.text and section.is_text_displayed: + # Convert HTML back to markdown if needed + section_text = section.text + lines.append(section_text) + + # Process questions in this section + questions = section.get_questions() + for question in questions: + question_markdown = self._format_question_to_markdown(question) + lines.append(question_markdown) + lines.append("") # Add blank line between questions + + # Close section marker for non-root sections + if not section.is_main_content: + lines.append("/section") + lines.append("") # Add blank line after section + + # Join with newlines and ensure proper formatting + result = "\n".join(lines) + if result and not result.endswith("\n"): + result += "\n" + return result + + def _format_question_to_markdown(self, question): + """ + Format a single question to markdown format matching the raw_content format + that the ANTLR questionparser expects. + Format: [number.] Type: ... Title: ... Points: ... [question text] [answers] [@Hint:] [@Feedback:] + """ + lines = [] + + # Question header: Type, Title, Points (each on separate line) + # Each header on its own line + if question.questiontype: + lines.append(f"Type: {question.questiontype}") + if question.title: + lines.append(f"Title: {question.title}") + if question.points: + lines.append(f"Points: {question.points}") + + # Add question text (HTML format from SCORM, convert to plain text) + # Prefix with question number if available (e.g., "1. 
Question text") + if question.text: + # Convert HTML to plain text if needed + question_text = question.text + # Remove HTML tags but keep content + import re + from bs4 import BeautifulSoup + try: + # Try to parse as HTML and extract text + soup = BeautifulSoup(question_text, 'html.parser') + question_text = soup.get_text(separator=' ', strip=True) + except: + # If not HTML, use as is but clean up extra whitespace + question_text = re.sub(r'\s+', ' ', question_text).strip() + + # Prefix with question number if available + question_number = None + if question.index is not None: + question_number = question.index + elif question.number_provided is not None: + question_number = question.number_provided + + if question_number is not None: + lines.append(f"{question_number}. {question_text}") + else: + lines.append(question_text) + + # Format question-specific content based on type + question_type = question.questiontype + if question_type == 'MC': + answer_text = self._format_multiple_choice_markdown(question) + if answer_text: + lines.append(answer_text) + elif question_type == 'TF': + answer_text = self._format_true_false_markdown(question) + if answer_text: + lines.append(answer_text) + elif question_type == 'FIB': + answer_text = self._format_fib_markdown(question) + if answer_text: + lines.append(answer_text) + elif question_type == 'MS': + answer_text = self._format_multi_select_markdown(question) + if answer_text: + lines.append(answer_text) + elif question_type == 'MAT': + answer_text = self._format_matching_markdown(question) + if answer_text: + lines.append(answer_text) + elif question_type == 'ORD': + answer_text = self._format_ordering_markdown(question) + if answer_text: + lines.append(answer_text) + elif question_type == 'WR': + answer_text = self._format_written_response_markdown(question) + if answer_text: + lines.append(answer_text) + + # Add hint if present (format: @Hint: or @HINT:) + if question.hint: + hint_text = question.hint + try: + soup = 
BeautifulSoup(hint_text, 'html.parser') + hint_text = soup.get_text(separator=' ', strip=True) + except: + hint_text = re.sub(r'\s+', ' ', hint_text).strip() + lines.append(f"@Hint: {hint_text}") + + # Add feedback if present (format: @Feedback: or @FEEDBACK:) + if question.feedback: + feedback_text = question.feedback + try: + soup = BeautifulSoup(feedback_text, 'html.parser') + feedback_text = soup.get_text(separator=' ', strip=True) + except: + feedback_text = re.sub(r'\s+', ' ', feedback_text).strip() + lines.append(f"@Feedback: {feedback_text}") + + return "\n".join(lines) + + def _format_multiple_choice_markdown(self, question): + """ + Format multiple choice question answers. + Format: a. [answer text] or *a. [answer text] for correct answers + """ + lines = [] + mc = question.get_multiple_choice() + if mc: + answers = mc.get_multiple_choice_answers() + for idx, answer in enumerate(answers, start=1): + letter = chr(96 + idx) # a, b, c, etc. + # Correct answer has * before the letter (weight > 0) + marker = "*" if answer.weight and answer.weight > 0 else "" + # Clean HTML from answer text + answer_text = answer.answer + try: + soup = BeautifulSoup(answer_text, 'html.parser') + answer_text = soup.get_text(separator=' ', strip=True) + except: + answer_text = re.sub(r'\s+', ' ', answer_text).strip() + lines.append(f"{letter}. {marker}{answer_text}") + if answer.answer_feedback: + feedback_text = answer.answer_feedback + try: + soup = BeautifulSoup(feedback_text, 'html.parser') + feedback_text = soup.get_text(separator=' ', strip=True) + except: + feedback_text = re.sub(r'\s+', ' ', feedback_text).strip() + lines.append(f"@Feedback: {feedback_text}") + return "\n".join(lines) + + def _format_true_false_markdown(self, question): + """ + Format true/false question answers. + Format: a. True / b. False with * after letter for correct answer (e.g., a. 
*True) + """ + lines = [] + tf = question.get_true_false() + if tf: + true_marker = "*" if tf.true_weight and tf.true_weight > 0 else "" + false_marker = "*" if tf.false_weight and tf.false_weight > 0 else "" + lines.append(f"a. {true_marker}True") + if tf.true_feedback: + lines.append(f"@Feedback: {tf.true_feedback}") + lines.append(f"b. {false_marker}False") + if tf.false_feedback: + lines.append(f"@Feedback: {tf.false_feedback}") + return "\n".join(lines) + + def _format_fib_markdown(self, question): + """ + Format fill in the blanks question. + Format: Question text with [blank] markers where answers go + """ + lines = [] + fibs = question.get_fibs() + current_text = "" + for fib in fibs: + if fib.type == 'fibquestion': + if fib.text: + current_text += fib.text + elif fib.type == 'fibanswer': + # Insert blank marker [ ] where the answer should be + current_text += "[ ]" + if current_text: + lines.append(current_text) + return "\n".join(lines) + + def _format_multi_select_markdown(self, question): + """ + Format multi-select question answers. + Format: a. [answer] or *a. [answer] for correct answers + """ + lines = [] + ms = question.get_multiple_select() + if ms: + answers = ms.get_multiple_select_answers() + for idx, answer in enumerate(answers, start=1): + letter = chr(96 + idx) # a, b, c, etc. + marker = "*" if answer.is_correct else "" + # Clean HTML from answer text + answer_text = answer.answer + try: + soup = BeautifulSoup(answer_text, 'html.parser') + answer_text = soup.get_text(separator=' ', strip=True) + except: + answer_text = re.sub(r'\s+', ' ', answer_text).strip() + lines.append(f"{letter}. 
{marker}{answer_text}") + if answer.answer_feedback: + feedback_text = answer.answer_feedback + try: + soup = BeautifulSoup(feedback_text, 'html.parser') + feedback_text = soup.get_text(separator=' ', strip=True) + except: + feedback_text = re.sub(r'\s+', ' ', feedback_text).strip() + lines.append(f"@Feedback: {feedback_text}") + return "\n".join(lines) + + def _format_matching_markdown(self, question): + """ + Format matching question. + Format: choice_text = answer_text (one per line) + """ + lines = [] + matching = question.get_matching() + if matching: + choices = matching.get_matching_choices() + for choice in choices: + lines.append(f"{choice.choice_text} =") + answers = choice.get_matching_answers() + for answer in answers: + lines.append(answer.answer_text) + return "\n".join(lines) + + def _format_ordering_markdown(self, question): + """ + Format ordering question. + Format: numbered list (1., 2., 3., etc.) + """ + lines = [] + orderings = question.get_orderings() + for ordering in orderings: + lines.append(f"{ordering.order}. {ordering.text}") + if ordering.ord_feedback: + lines.append(f"@Feedback: {ordering.ord_feedback}") + return "\n".join(lines) + + def _format_written_response_markdown(self, question): + """ + Format written response question. + Format: Correct Answer: [answer key text] + """ + lines = [] + wr = question.get_written_response() + if wr and wr.answer_key: + lines.append(f"Correct Answer: {wr.answer_key}") + return "\n".join(lines) + + def convert_markdown_to_docx(self, markdown_text, output_path): + """ + Convert markdown text to DOCX file using pandoc (reverse of run_pandoc_task). + This is the final step to generate DOCX from the formatted markdown. 
+ + Args: + markdown_text: Markdown formatted text (from format_to_markdown) + output_path: Path where the DOCX file should be saved + + Returns: + str: Path to the created DOCX file + """ + import pypandoc + import tempfile + import os + + # Create a temporary markdown file + with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False, encoding='utf-8') as temp_md: + temp_md.write(markdown_text) + temp_md_path = temp_md.name + + try: + # Convert markdown to DOCX using pandoc (reverse of DOCX → markdown) + # Use similar settings as the forward conversion but in reverse + pypandoc.convert_file( + temp_md_path, + format='markdown_github+fancy_lists+emoji+hard_line_breaks+all_symbols_escapable+escaped_line_breaks+pipe_tables+startnum+tex_math_dollars', + to='docx+empty_paragraphs', + outputfile=output_path, + extra_args=[ + '--no-highlight', + '--preserve-tabs', + '--wrap=preserve', + '--indent=false', + '--mathml', + '--ascii' + ] + ) + finally: + # Clean up temporary markdown file + if os.path.exists(temp_md_path): + os.unlink(temp_md_path) + + return output_path diff --git a/api/serializers.py b/api/serializers.py index d8dbc47..4632eaf 100644 --- a/api/serializers.py +++ b/api/serializers.py @@ -12,18 +12,43 @@ def validate_docx_file(value): raise serializers.ValidationError("not a valid word file") +def validate_zip_file(value): + """Validate that uploaded file is a ZIP file.""" + if not value.name.endswith('.zip'): + raise serializers.ValidationError("not a valid zip file") + return value + + def count_errors(questionlibrary): + """ + Count document and question errors. + For reverse conversion (SCORM to JSON), errors are typically 0 since + we're not parsing with ANTLR which would generate errors. 
+ """ # COUNT NUMBER OF DOCUMENT ERRORS - doc_errorlist = DocumentError.objects.filter(document=questionlibrary) - questionlibrary.total_document_errors = doc_errorlist.count() + # Check if DocumentError model exists (it may not be defined) + try: + from .models import DocumentError + doc_errorlist = DocumentError.objects.filter(document=questionlibrary) + questionlibrary.total_document_errors = doc_errorlist.count() + except (ImportError, AttributeError, NameError): + # DocumentError model doesn't exist, set to 0 + questionlibrary.total_document_errors = 0 # COUNT NUMBER OF QUESTION ERRORS - question_list = Question.objects.filter(question_library=questionlibrary) - num_question_errors = 0 - for q in question_list: - q_errorlist = QuestionError.objects.filter(question=q) - num_question_errors += q_errorlist.count() - questionlibrary.total_question_errors = num_question_errors + # Check if QuestionError model exists (it may not be defined) + try: + from .models import QuestionError + question_list = Question.objects.filter(section__question_library=questionlibrary) + num_question_errors = 0 + for q in question_list: + q_errorlist = QuestionError.objects.filter(question=q) + num_question_errors += q_errorlist.count() + questionlibrary.total_question_errors = num_question_errors + except (ImportError, AttributeError, NameError): + # QuestionError model doesn't exist, set to 0 + questionlibrary.total_question_errors = 0 + questionlibrary.save() @@ -56,6 +81,25 @@ def update(self, instance, validated_data): return instance +class ScormToJsonSerializer(serializers.Serializer): + """Serializer for SCORM ZIP file upload to convert to JSON (mirrors WordToJsonSerializer).""" + scorm_file = serializers.FileField(validators=[validate_zip_file], max_length=100, allow_empty_file=False, use_url=True) + + def create(self, validated_data): + newconversion = QuestionLibrary.objects.create() + newconversion.temp_file = validated_data.get('scorm_file', validated_data) + + # Set main 
title from filename + newconversion.main_title = newconversion.temp_file.name.split(".")[0] + newconversion.filter_main_title() + newconversion.folder_path = settings.MEDIA_ROOT + str(newconversion.id) + newconversion.image_path = newconversion.folder_path + settings.MEDIA_URL + newconversion.create_directory() + newconversion.save() + + return newconversion + + class JsonToScormSerializer(serializers.Serializer): json_data = serializers.JSONField(initial=dict) diff --git a/api/urls.py b/api/urls.py index b8971b4..f6d4e0b 100644 --- a/api/urls.py +++ b/api/urls.py @@ -9,4 +9,6 @@ urlpatterns = [ path('convert', views.WordToJson.as_view(), name='WordToJson'), path('package', views.JsonToScorm.as_view(), name='JsonToScorm'), + path('scorm-to-json', views.ScormToJson.as_view(), name='ScormToJson'), + path('json-to-docx', views.JsonToDocx.as_view(), name='JsonToDocx'), ] diff --git a/api/views.py b/api/views.py index 7e1e236..5007e8a 100644 --- a/api/views.py +++ b/api/views.py @@ -4,7 +4,7 @@ import json from rest_framework import viewsets -from .serializers import JsonToScormSerializer, QuestionLibraryPackageSerializer, WordToJsonSerializer +from .serializers import JsonToScormSerializer, QuestionLibraryPackageSerializer, WordToJsonSerializer, ScormToJsonSerializer from rest_framework import generics from rest_framework.views import APIView from rest_framework.response import Response @@ -220,6 +220,207 @@ def post(self, request, format=None): return JsonResponse({"hostname": settings.APP_VERSION, "serializer_errors": ql_serializer.errors}, status=400) + +class ScormToJson(APIView): + """ + Reverse API endpoint: Converts SCORM ZIP file to JSON (mirrors WordToJson). + This is step 1 of the reverse process: SCORM → JSON. + + Steps: + 1. Extract SCORM ZIP + 2. Parse XML (XmlReader) → populate Django models + 3. Serialize models to JSON using QuestionLibraryPackageSerializer + 4. 
Return JSON data + """ + parser_classes = [MultiPartParser] + permission_classes = [AllowAny] + authentication_classes = [TokenAuthenticationWithBearer] + serializer_class = ScormToJsonSerializer + + def post(self, request, format=None): + file_obj = request.data.get('scorm_file') + serializer = ScormToJsonSerializer(data={ + 'scorm_file': file_obj + }) + + if serializer.is_valid(): + instance = serializer.save() + + try: + # Step 1: Extract SCORM ZIP and parse XML using XmlReader + from .scorm.XmlReader import XmlReader + from os import path + + # Get the SCORM ZIP file path + scorm_zip_path = instance.temp_file.path + + # Extract and parse SCORM XML + xml_reader = XmlReader(scorm_zip_path, extract_to_path=path.join(instance.folder_path, 'scorm_extract')) + + # Step 2: Populate Django models from parsed XML + question_library = xml_reader.populate_django_models(instance) + + # Step 3: Serialize models to JSON (same format as WordToJson returns) + from .serializers import QuestionLibraryPackageSerializer + ql_serializer = QuestionLibraryPackageSerializer(question_library) + json_data = ql_serializer.data + + # Add error counts (similar to WordToJson) + from .serializers import count_errors + count_errors(question_library) + json_data['total_question_errors'] = str(question_library.total_question_errors or 0) + json_data['total_document_errors'] = str(question_library.total_document_errors or 0) + + instance.json_data = json_data + instance.save() + + logger.addFilter(QuestionlibraryFilenameFilter(instance)) + logger.info(f"[{instance.id}] SCORM to JSON conversion completed") + + instance.cleanup() + + return JsonResponse(json_data, status=200) + + except Exception as e: + logger.error(f"SCORM to JSON conversion failed: {str(e)}") + instance.cleanup() + return JsonResponse({"error": str(e)}, status=500) + + return JsonResponse(serializer.errors, status=400) + + +class JsonToDocx(APIView): + """ + Reverse API endpoint: Converts JSON to DOCX (mirrors JsonToScorm). 
+ This is step 2 of the reverse process: JSON → DOCX. + + Steps: + 1. Deserialize JSON to Django models (using QuestionLibraryPackageSerializer) + 2. Convert models to markdown (format_to_markdown) + 3. Convert markdown to DOCX using Pandoc + 4. Return DOCX file + """ + parser_classes = [JSONParser] + permission_classes = [AllowAny] + authentication_classes = [TokenAuthenticationWithBearer] + serializer_class = JsonToScormSerializer + + def post(self, request, format=None): + json_data = request.data + + # Use the same serializer as JsonToScorm to deserialize JSON to models + ql_serializer = QuestionLibraryPackageSerializer(data=json_data.get('data', json_data)) + if ql_serializer.is_valid(): + ql_instance = ql_serializer.save() + ql_instance.filter_main_title() + ql_instance.folder_path = settings.MEDIA_ROOT + str(ql_instance.id) + ql_instance.image_path = ql_instance.folder_path + settings.MEDIA_URL + ql_instance.create_directory() + ql_instance.save() + + try: + # Step 1: Convert Django models to markdown (matching formatter_output format) + from .scorm.XmlReader import XmlReader + from os import path + import pypandoc + import re + + # Create XmlReader instance (we only need the format_to_markdown method) + # Since we don't need to parse XML, we create a minimal instance + xml_reader = object.__new__(XmlReader) # Create instance without calling __init__ + markdown_text = xml_reader.format_to_markdown(ql_instance) + + # Step 2: Convert markdown to DOCX using Pandoc (reverse of run_pandoc_task) + # Use main_title if it exists, otherwise use filtered_main_title + if ql_instance.main_title: + # Clean main_title for filename (remove invalid characters, limit length) + filename = ql_instance.main_title.strip() + filename = re.sub(r'[<>:"/\\|?*]', '', filename) # Remove invalid filename characters + filename = re.sub(r'\s+', '_', filename) # Replace spaces with underscores + filename = filename[:100] # Limit length + if not filename: + filename = 
ql_instance.filtered_main_title + else: + filename = ql_instance.filtered_main_title + + docx_filename = f"{filename}.docx" + docx_path = path.join(ql_instance.folder_path, docx_filename) + + # Convert markdown to DOCX + # Use similar settings as the forward conversion but in reverse + mdblockquotePath = "./pandoc/pandoc-filters/mdblockquote.lua" + emptyparaPath = "./pandoc/pandoc-filters/emptypara.lua" + + # Create temporary markdown file + temp_md_path = path.join(ql_instance.folder_path, "temp_markdown.md") + with open(temp_md_path, 'w', encoding='utf-8') as f: + f.write(markdown_text) + + try: + # Convert markdown to DOCX (reverse of DOCX → markdown) + # First convert markdown to HTML (intermediate step like forward conversion) + pandoc_md_to_html = pypandoc.convert_file( + temp_md_path, + format='markdown_github+fancy_lists+emoji+hard_line_breaks+all_symbols_escapable+escaped_line_breaks+pipe_tables+startnum+tex_math_dollars', + to='html+empty_paragraphs+tex_math_single_backslash', + extra_args=[ + '--no-highlight', + '--embed-resources', + '--markdown-headings=atx', + '--preserve-tabs', + '--wrap=preserve', + '--indent=false', + '--mathml', + '--ascii', + '--lua-filter=' + mdblockquotePath, + '--lua-filter=' + emptyparaPath, + ] + ) + + # Then convert HTML to DOCX + pypandoc.convert_text( + pandoc_md_to_html, + format='html+empty_paragraphs', + to='docx+empty_paragraphs', + outputfile=docx_path, + extra_args=[ + '--no-highlight', + '--preserve-tabs', + '--wrap=preserve', + '--indent=false', + '--mathml', + '--ascii', + ] + ) + finally: + # Clean up temporary markdown file + if path.exists(temp_md_path): + from os import remove + remove(temp_md_path) + + # Step 3: Return DOCX file + from django.core.files import File + with open(docx_path, 'rb') as f: + ql_instance.temp_file.save(docx_filename, File(f), save=True) + + file_response = FileResponse(ql_instance.temp_file) + file_response['Content-Disposition'] = f'attachment; filename="{docx_filename}"' + + 
logger.addFilter(QuestionlibraryFilenameFilter(ql_instance)) + logger.info(f"[{ql_instance.id}] JSON to DOCX conversion completed") + + ql_instance.cleanup() + + return file_response + + except Exception as e: + logger.error(f"JSON to DOCX conversion failed: {str(e)}") + ql_instance.cleanup() + return JsonResponse({"error": str(e)}, status=500) + + return JsonResponse({"hostname": settings.APP_VERSION, "serializer_errors": ql_serializer.errors}, status=400) + + class RootPath(APIView): permission_classes = [AllowAny] diff --git a/requirements.txt b/requirements.txt index e60f977..c898960 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ python-dotenv==0.21.0 channels==3.0.5 daphne==3.0.2 Twisted[tls,http2]==22.8.0 -celery==5.2.7 +celery==5.3.6 redis==4.3.4 channels-redis==4.0.0 psycopg2-binary==2.9.5 From 260207f10211cc0abe26c0023bf1284d34e467bb Mon Sep 17 00:00:00 2001 From: Arvin Rolos Date: Thu, 8 Jan 2026 13:36:36 -0800 Subject: [PATCH 2/8] Improve SCORM to JSON/DOCX conversion: add CDATA whitespace cleaning, fix parsing for Matching/Ordering questions, enhance markdown formatting with proper indentation and hard breaks --- api/scorm/XmlReader.py | 354 ++++++++++++++++++++++++++++++----------- api/serializers.py | 18 +++ 2 files changed, 283 insertions(+), 89 deletions(-) diff --git a/api/scorm/XmlReader.py b/api/scorm/XmlReader.py index 79bb591..162f1d2 100644 --- a/api/scorm/XmlReader.py +++ b/api/scorm/XmlReader.py @@ -289,7 +289,10 @@ def _parse_question(self, item_el): return question_data def _extract_text_from_material(self, material_el): - """Extract text content from material element, handling CDATA.""" + """ + Extract text content from material element, handling CDATA. + Automatically cleans CDATA whitespace and HTML tags. 
+ """ text_parts = [] # Navigate through flow_mat -> material -> mattext @@ -300,16 +303,21 @@ def _extract_text_from_material(self, material_el): mattext = material.find('mattext') if mattext is not None: # Get text content (handles CDATA) - text = mattext.text if mattext.text else '' + raw_text = mattext.text if mattext.text else '' # Also check for CDATA in tail if mattext.tail: - text += mattext.tail - text_parts.append(text) + raw_text += mattext.tail + # Clean CDATA whitespace while preserving HTML tags + cleaned_text = self._clean_cdata_text(raw_text) + text_parts.append(cleaned_text) - return ''.join(text_parts).strip() + return ''.join(text_parts) def _extract_question_text(self, presentation_el): - """Extract question text from presentation element.""" + """ + Extract question text from presentation element. + Automatically cleans CDATA whitespace and HTML tags. + """ text_parts = [] flow = presentation_el.find('flow') @@ -319,12 +327,14 @@ def _extract_question_text(self, presentation_el): if material is not None: mattext = material.find('mattext') if mattext is not None: - text = mattext.text if mattext.text else '' + raw_text = mattext.text if mattext.text else '' if mattext.tail: - text += mattext.tail - text_parts.append(text) + raw_text += mattext.tail + # Clean CDATA whitespace while preserving HTML tags + cleaned_text = self._clean_cdata_text(raw_text) + text_parts.append(cleaned_text) - return ''.join(text_parts).strip() + return ''.join(text_parts) def _extract_text_from_hint(self, hint_el): """Extract text from hint element.""" @@ -334,14 +344,55 @@ def _extract_text_from_hint(self, hint_el): return None def _extract_text_from_feedback(self, feedback_el): - """Extract text from feedback element.""" + """ + Extract text from feedback element. + Automatically cleans CDATA whitespace while preserving HTML tags. 
+ """ material = feedback_el.find('material') if material is not None: mattext = material.find('mattext') if mattext is not None: - return mattext.text if mattext.text else '' + raw_text = mattext.text if mattext.text else '' + # Clean CDATA whitespace while preserving HTML tags + return self._clean_cdata_text(raw_text) return None + def _clean_cdata_text(self, text): + """ + Clean text extracted from CDATA sections in SCORM XML. + + SCORM XML often contains CDATA with excessive whitespace, newlines, and tabs + that are formatting artifacts rather than meaningful content. This method: + 1. Preserves HTML tags (e.g.,

, , etc.) + 2. Normalizes whitespace between HTML tags (multiple spaces/newlines/tabs -> single space) + 3. Trims leading/trailing whitespace + + This ensures clean JSON output while preserving HTML structure for proper rendering. + + Args: + text: Raw text string from XML CDATA + + Returns: + str: Cleaned text with normalized whitespace but HTML tags preserved + """ + if not text: + return '' + + try: + # Normalize whitespace while preserving HTML tags + # Replace sequences of whitespace (spaces, tabs, newlines) with a single space + # But be careful not to break HTML tag structure + cleaned = re.sub(r'[ \t\n\r]+', ' ', text) + # Remove whitespace between HTML tags (e.g., "> <" -> "><") + cleaned = re.sub(r'>\s+<', '><', cleaned) + # Trim leading/trailing whitespace + cleaned = cleaned.strip() + return cleaned + except Exception: + # Fallback: if regex fails, just normalize whitespace + cleaned = re.sub(r'\s+', ' ', text).strip() + return cleaned + def _parse_multiple_choice(self, item_el, question_ident): """ Parse multiple choice question data. 
@@ -390,7 +441,9 @@ def _parse_multiple_choice(self, item_el, question_ident): mattext = response_label.find('.//mattext') answer_text = '' if mattext is not None: - answer_text = mattext.text if mattext.text else '' + raw_text = mattext.text if mattext.text else '' + # Clean CDATA whitespace while preserving HTML tags + answer_text = self._clean_cdata_text(raw_text) # Find weight from resprocessing weight = 0.0 @@ -546,6 +599,7 @@ def _parse_fill_in_the_blanks(self, item_el, question_ident): mattext = child.find('mattext') text = '' if mattext is not None: + # Don't clean CDATA for FIB - preserve original spacing text = mattext.text if mattext.text else '' fib_data['fibs'].append({ @@ -640,7 +694,9 @@ def _parse_multi_select(self, item_el, question_ident): mattext = response_label.find('.//mattext') answer_text = '' if mattext is not None: - answer_text = mattext.text if mattext.text else '' + raw_text = mattext.text if mattext.text else '' + # Clean CDATA whitespace while preserving HTML tags + answer_text = self._clean_cdata_text(raw_text) # Determine if correct from resprocessing is_correct = False @@ -705,14 +761,28 @@ def _parse_matching(self, item_el, question_ident): except (ValueError, TypeError): pass - # Collect all unique matching answers first (from render_choice) + # Collect all unique matching answers first (from all render_choices) matching_answers = {} - answer_index = 1 - question_ident_answer = question_ident + "_A" # Find all response_grp elements (one per choice) response_grps = flow.findall('response_grp') + # First pass: collect all possible answers from all choices + for response_grp in response_grps: + render_choice = response_grp.find('render_choice') + if render_choice is not None: + # Find all response_label elements directly (they may all be in one flow_label) + for response_label in render_choice.findall('.//response_label'): + answer_ident = response_label.get('ident', '') + mattext = response_label.find('.//mattext') + if mattext is 
not None: + raw_text = mattext.text if mattext.text else '' + # Clean CDATA whitespace while preserving HTML tags + answer_text = self._clean_cdata_text(raw_text) + if answer_text and answer_ident not in matching_answers: + matching_answers[answer_ident] = answer_text + + # Second pass: process each choice and find its correct answer for response_grp in response_grps: choice_ident = response_grp.get('respident', '') @@ -722,22 +792,9 @@ def _parse_matching(self, item_el, question_ident): if material is not None: mattext = material.find('mattext') if mattext is not None: - choice_text = mattext.text if mattext.text else '' - - # Get the render_choice to find available answers - render_choice = response_grp.find('render_choice') - matching_answer_texts = [] - - if render_choice is not None: - for flow_label in render_choice.findall('.//flow_label'): - response_label = flow_label.find('response_label') - if response_label is not None: - answer_ident = response_label.get('ident', '') - mattext = response_label.find('.//mattext') - if mattext is not None: - answer_text = mattext.text if mattext.text else '' - if answer_text and answer_ident not in matching_answers: - matching_answers[answer_ident] = answer_text + raw_text = mattext.text if mattext.text else '' + # Clean CDATA whitespace while preserving HTML tags + choice_text = self._clean_cdata_text(raw_text) # Find correct answer from resprocessing correct_answer_ident = None @@ -751,6 +808,7 @@ def _parse_matching(self, item_el, question_ident): setvar = respcondition.find('setvar') if setvar is not None and setvar.get('varname') == 'D2L_Correct': correct_answer_ident = varequal.text + break # Found the correct answer for this choice # Build matching answers list for this choice matching_answers_list = [] @@ -793,32 +851,33 @@ def _parse_ordering(self, item_el, question_ident): return ord_data # Parse ordering items + # Find all response_label elements directly (they may all be in one flow_label) order_index = 1 - 
for flow_label in render_choice.findall('.//flow_label'): - response_label = flow_label.find('response_label') - if response_label is not None: - ident_num = response_label.get('ident', '') - - # Extract text - mattext = response_label.find('.//mattext') - text = '' - if mattext is not None: - text = mattext.text if mattext.text else '' - - # Find feedback - ord_feedback = None - question_ident_feedback = question_ident + "_IF" - feedback_ident = question_ident_feedback + str(order_index) - feedback_el = item_el.find(f".//itemfeedback[@ident='{feedback_ident}']") - if feedback_el is not None: - ord_feedback = self._extract_text_from_feedback(feedback_el) - - ord_data['items'].append({ - 'text': text, - 'order': order_index, - 'ord_feedback': ord_feedback - }) - order_index += 1 + for response_label in render_choice.findall('.//response_label'): + ident_num = response_label.get('ident', '') + + # Extract text + mattext = response_label.find('.//mattext') + text = '' + if mattext is not None: + raw_text = mattext.text if mattext.text else '' + # Clean CDATA whitespace while preserving HTML tags + text = self._clean_cdata_text(raw_text) + + # Find feedback + ord_feedback = None + question_ident_feedback = question_ident + "_IF" + feedback_ident = question_ident_feedback + str(order_index) + feedback_el = item_el.find(f".//itemfeedback[@ident='{feedback_ident}']") + if feedback_el is not None: + ord_feedback = self._extract_text_from_feedback(feedback_el) + + ord_data['items'].append({ + 'text': text, + 'order': order_index, + 'ord_feedback': ord_feedback + }) + order_index += 1 return ord_data @@ -853,7 +912,9 @@ def _parse_written_response(self, item_el, question_ident): if answer_key_mat is not None: mattext = answer_key_mat.find('.//mattext') if mattext is not None: - wr_data['answer_key'] = mattext.text if mattext.text else '' + raw_text = mattext.text if mattext.text else '' + # Clean CDATA whitespace while preserving HTML tags + wr_data['answer_key'] = 
self._clean_cdata_text(raw_text) # Parse initial_text (if present) initial_text_el = item_el.find('initial_text') @@ -862,7 +923,10 @@ def _parse_written_response(self, item_el, question_ident): if initial_text_mat is not None: mattext = initial_text_mat.find('.//mattext') if mattext is not None: - wr_data['initial_text'] = mattext.text if mattext.text else None + raw_text = mattext.text if mattext.text else '' + # Clean CDATA whitespace while preserving HTML tags + cleaned_text = self._clean_cdata_text(raw_text) + wr_data['initial_text'] = cleaned_text if cleaned_text else None return wr_data @@ -1184,11 +1248,14 @@ def _format_question_to_markdown(self, question): if question.title: lines.append(f"Title: {question.title}") if question.points: - lines.append(f"Points: {question.points}") + # Normalize points: remove trailing zeros and decimal if not needed (e.g., 1.0000 -> 1, 1.5 -> 1.5) + normalized_points = str(float(question.points)).rstrip('0').rstrip('.') + lines.append(f"Points: {normalized_points}") # Add question text (HTML format from SCORM, convert to plain text) # Prefix with question number if available (e.g., "1. Question text") - if question.text: + # Note: For FIB questions, skip displaying question.text here since FIB formatting includes all text parts + if question.text and question.questiontype != 'FIB': # Convert HTML to plain text if needed question_text = question.text # Remove HTML tags but keep content @@ -1227,7 +1294,17 @@ def _format_question_to_markdown(self, question): elif question_type == 'FIB': answer_text = self._format_fib_markdown(question) if answer_text: - lines.append(answer_text) + # For FIB questions, prefix with question number since we skipped question.text above + question_number = None + if question.index is not None: + question_number = question.index + elif question.number_provided is not None: + question_number = question.number_provided + + if question_number is not None: + lines.append(f"{question_number}. 
{answer_text}") + else: + lines.append(answer_text) elif question_type == 'MS': answer_text = self._format_multi_select_markdown(question) if answer_text: @@ -1265,7 +1342,8 @@ def _format_question_to_markdown(self, question): feedback_text = re.sub(r'\s+', ' ', feedback_text).strip() lines.append(f"@Feedback: {feedback_text}") - return "\n".join(lines) + # Use double newlines so each logical line becomes a paragraph (hard breaks, not soft) + return "\n\n".join(lines) def _format_multiple_choice_markdown(self, question): """ @@ -1287,7 +1365,8 @@ def _format_multiple_choice_markdown(self, question): answer_text = soup.get_text(separator=' ', strip=True) except: answer_text = re.sub(r'\s+', ' ', answer_text).strip() - lines.append(f"{letter}. {marker}{answer_text}") + # Indent as level 2 list (4 spaces for markdown level 2) + lines.append(f" {letter}. {marker}{answer_text}") if answer.answer_feedback: feedback_text = answer.answer_feedback try: @@ -1295,7 +1374,7 @@ def _format_multiple_choice_markdown(self, question): feedback_text = soup.get_text(separator=' ', strip=True) except: feedback_text = re.sub(r'\s+', ' ', feedback_text).strip() - lines.append(f"@Feedback: {feedback_text}") + lines.append(f" @Feedback: {feedback_text}") return "\n".join(lines) def _format_true_false_markdown(self, question): @@ -1308,18 +1387,21 @@ def _format_true_false_markdown(self, question): if tf: true_marker = "*" if tf.true_weight and tf.true_weight > 0 else "" false_marker = "*" if tf.false_weight and tf.false_weight > 0 else "" - lines.append(f"a. {true_marker}True") + # Indent as level 2 list (4 spaces for markdown level 2) + lines.append(f" a. {true_marker}True") if tf.true_feedback: - lines.append(f"@Feedback: {tf.true_feedback}") - lines.append(f"b. {false_marker}False") + lines.append(f" @Feedback: {tf.true_feedback}") + lines.append(f" b. 
{false_marker}False") if tf.false_feedback: - lines.append(f"@Feedback: {tf.false_feedback}") + lines.append(f" @Feedback: {tf.false_feedback}") return "\n".join(lines) def _format_fib_markdown(self, question): """ Format fill in the blanks question. - Format: Question text with [blank] markers where answers go + Format: Question text with [answer] markers where answers go + Example: "A [rose,flower] by any other name would smell as [sweet,good]." + Note: Clean HTML tags but preserve spacing (CDATA cleaning was skipped during parsing). """ lines = [] fibs = question.get_fibs() @@ -1327,10 +1409,21 @@ def _format_fib_markdown(self, question): for fib in fibs: if fib.type == 'fibquestion': if fib.text: - current_text += fib.text + # Clean HTML tags but preserve spacing + from bs4 import BeautifulSoup + try: + soup = BeautifulSoup(fib.text, 'html.parser') + cleaned_text = soup.get_text(separator=' ', strip=False) + current_text += cleaned_text + except Exception: + # Fallback: use text as-is if BeautifulSoup fails + current_text += fib.text elif fib.type == 'fibanswer': - # Insert blank marker [ ] where the answer should be - current_text += "[ ]" + # Insert answer in brackets [answer] where the blank should be + if fib.text: + current_text += f" [{fib.text}]" + else: + current_text += " [ ]" if current_text: lines.append(current_text) return "\n".join(lines) @@ -1354,7 +1447,8 @@ def _format_multi_select_markdown(self, question): answer_text = soup.get_text(separator=' ', strip=True) except: answer_text = re.sub(r'\s+', ' ', answer_text).strip() - lines.append(f"{letter}. {marker}{answer_text}") + # Indent as level 2 list (4 spaces for markdown level 2) + lines.append(f" {letter}. 
{marker}{answer_text}") if answer.answer_feedback: feedback_text = answer.answer_feedback try: @@ -1362,48 +1456,130 @@ def _format_multi_select_markdown(self, question): feedback_text = soup.get_text(separator=' ', strip=True) except: feedback_text = re.sub(r'\s+', ' ', feedback_text).strip() - lines.append(f"@Feedback: {feedback_text}") + lines.append(f" @Feedback: {feedback_text}") return "\n".join(lines) def _format_matching_markdown(self, question): """ Format matching question. - Format: choice_text = answer_text (one per line) + Format: a. choice_text = answer_text (on same line, with enumeration) + Preserves inline HTML styling (bold, italic, etc.) but removes block-level tags (p, div, etc.) """ lines = [] matching = question.get_matching() if matching: choices = matching.get_matching_choices() - for choice in choices: - lines.append(f"{choice.choice_text} =") - answers = choice.get_matching_answers() - for answer in answers: - lines.append(answer.answer_text) + for idx, choice in enumerate(choices, start=1): + letter = chr(96 + idx) # a, b, c, etc. + + # Remove block-level HTML tags but preserve inline styling + choice_text = self._remove_block_tags_preserve_inline(choice.choice_text) + + # Use the related manager matching_answers (from ForeignKey in MatchingAnswer) + answers = choice.matching_answers.all() + if answers: + # Get the first matching answer (typically there's one per choice) + answer = answers[0] + answer_text = self._remove_block_tags_preserve_inline(answer.answer_text) + # Indent as level 2 list (4 spaces for markdown level 2) + lines.append(f" {letter}. {choice_text} = {answer_text}") + else: + # No answer found, just show choice + lines.append(f" {letter}. {choice_text} =") return "\n".join(lines) + def _remove_block_tags_preserve_inline(self, html_text): + """ + Remove block-level HTML tags (p, div, etc.) but preserve inline styling tags (strong, em, b, i, etc.). 
+ This allows formatting like bold/italic to be preserved while removing tags that cause line breaks. + Returns HTML string with inline tags preserved. + """ + if not html_text: + return '' + + try: + from bs4 import BeautifulSoup + soup = BeautifulSoup(html_text, 'html.parser') + + # Unwrap block-level tags (these cause line breaks) but preserve their content and inline tags + block_tags = ['p', 'div', 'br', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'ul', 'ol'] + for tag_name in block_tags: + for tag in soup.find_all(tag_name): + # Unwrap removes the tag but keeps its content (including inline tags) + tag.unwrap() + + # Get the HTML string with inline tags preserved + result = str(soup) + # Clean up: remove leading/trailing whitespace and normalize internal whitespace + # But preserve HTML tag structure + result = re.sub(r'>\s+<', '><', result) # Remove whitespace between tags + result = re.sub(r'\s+', ' ', result) # Normalize whitespace + result = result.strip() + return result + except Exception: + # Fallback: if parsing fails, just clean whitespace but preserve HTML structure + cleaned = re.sub(r'>\s+<', '><', html_text) + cleaned = re.sub(r'\s+', ' ', cleaned).strip() + return cleaned + def _format_ordering_markdown(self, question): """ Format ordering question. - Format: numbered list (1., 2., 3., etc.) + Format: lettered list (a., b., c., etc.) with HTML tags cleaned, indented as level 2 list """ lines = [] orderings = question.get_orderings() - for ordering in orderings: - lines.append(f"{ordering.order}. {ordering.text}") + for idx, ordering in enumerate(orderings, start=1): + letter = chr(96 + idx) # a, b, c, etc. 
+ # Clean HTML from ordering text + ordering_text = ordering.text + try: + from bs4 import BeautifulSoup + soup = BeautifulSoup(ordering_text, 'html.parser') + ordering_text = soup.get_text(separator=' ', strip=True) + except: + import re + ordering_text = re.sub(r'\s+', ' ', ordering_text).strip() + # Indent as level 2 list (4 spaces for markdown level 2) + lines.append(f" {letter}. {ordering_text}") if ordering.ord_feedback: - lines.append(f"@Feedback: {ordering.ord_feedback}") + feedback_text = ordering.ord_feedback + try: + from bs4 import BeautifulSoup + soup = BeautifulSoup(feedback_text, 'html.parser') + feedback_text = soup.get_text(separator=' ', strip=True) + except: + import re + feedback_text = re.sub(r'\s+', ' ', feedback_text).strip() + lines.append(f" @Feedback: {feedback_text}") return "\n".join(lines) def _format_written_response_markdown(self, question): """ Format written response question. - Format: Correct Answer: [answer key text] + Format: Blank line, then "Correct Answer:" indented, then indented answer text. + Use double newlines to ensure hard paragraph breaks (not soft returns) in DOCX. 
""" lines = [] wr = question.get_written_response() if wr and wr.answer_key: - lines.append(f"Correct Answer: {wr.answer_key}") - return "\n".join(lines) + # Add blank line first (double newline for hard paragraph break) + lines.append("") + # Clean HTML from answer text + answer_text = wr.answer_key + try: + from bs4 import BeautifulSoup + soup = BeautifulSoup(answer_text, 'html.parser') + answer_text = soup.get_text(separator=' ', strip=True) + except: + import re + answer_text = re.sub(r'\s+', ' ', answer_text).strip() + # Indent with regular spaces (3 for label, 7 for answer) to mimic margin + # Avoid 4+ leading spaces to prevent markdown list or code block detection + lines.append(f"Correct Answer:") + lines.append(f"{answer_text}") + # Use double newlines so each logical line becomes a paragraph (hard breaks) + return "\n\n".join(lines) def convert_markdown_to_docx(self, markdown_text, output_path): """ diff --git a/api/serializers.py b/api/serializers.py index 4632eaf..acea5aa 100644 --- a/api/serializers.py +++ b/api/serializers.py @@ -239,6 +239,15 @@ class QuestionSerializer(serializers.ModelSerializer): matching = MatchingSerializer(many=True, allow_null=True) ordering = serializers.SerializerMethodField() written_response = WrittenResponseSerializer(many=True, allow_null=True) + points = serializers.SerializerMethodField() + + def get_points(self, obj): + """Normalize points: remove trailing zeros and decimal if not needed (e.g., 1.0000 -> '1', 1.5 -> '1.5')""" + if obj.points is None: + return None + # Convert to normalized string: remove trailing zeros and decimal point if not needed + normalized = str(float(obj.points)).rstrip('0').rstrip('.') + return normalized if normalized else '0' def get_fib(self, question): ordering_queryset = question.get_fibs() @@ -306,6 +315,15 @@ class QuestionPackageSerializer(serializers.ModelSerializer): matching = MatchingSerializer(many=True, allow_null=True) ordering = OrderingSerializer(many=True, allow_null=True) 
written_response = WrittenResponseSerializer(many=True, allow_null=True) + points = serializers.SerializerMethodField() + + def get_points(self, obj): + """Normalize points: remove trailing zeros and decimal if not needed (e.g., 1.0000 -> '1', 1.5 -> '1.5')""" + if obj.points is None: + return None + # Convert to normalized string: remove trailing zeros and decimal point if not needed + normalized = str(float(obj.points)).rstrip('0').rstrip('.') + return normalized if normalized else '0' class Meta: model = Question From 0b936b15a86c919cdc5490f63d37e824feab1885 Mon Sep 17 00:00:00 2001 From: Arvin Rolos Date: Mon, 12 Jan 2026 11:09:37 -0800 Subject: [PATCH 3/8] Handle SCORM line breaks, math, and inline images in Pandoc flow --- api/scorm/XmlReader.py | 335 ++++++++++++++++++++-------- api/tasks.py | 2 + api/views.py | 210 ++++++++++++++--- pandoc/pandoc-filters/linebreak.lua | 11 + restapi/tasks.py | 2 + 5 files changed, 429 insertions(+), 131 deletions(-) create mode 100644 pandoc/pandoc-filters/linebreak.lua diff --git a/api/scorm/XmlReader.py b/api/scorm/XmlReader.py index 162f1d2..11fbc9e 100644 --- a/api/scorm/XmlReader.py +++ b/api/scorm/XmlReader.py @@ -9,6 +9,7 @@ from django.conf import settings from bs4 import BeautifulSoup import re +import base64 from api.models import ( QuestionLibrary, Section, Question, MultipleChoice, MultipleChoiceAnswer, @@ -292,6 +293,7 @@ def _extract_text_from_material(self, material_el): """ Extract text content from material element, handling CDATA. Automatically cleans CDATA whitespace and HTML tags. + Converts SCORM image file paths to base64 data URIs. 
""" text_parts = [] @@ -309,6 +311,8 @@ def _extract_text_from_material(self, material_el): raw_text += mattext.tail # Clean CDATA whitespace while preserving HTML tags cleaned_text = self._clean_cdata_text(raw_text) + # Convert SCORM image file paths to base64 + cleaned_text = self._convert_scorm_images_to_base64(cleaned_text) text_parts.append(cleaned_text) return ''.join(text_parts) @@ -317,6 +321,7 @@ def _extract_question_text(self, presentation_el): """ Extract question text from presentation element. Automatically cleans CDATA whitespace and HTML tags. + Converts SCORM image file paths to base64 data URIs. """ text_parts = [] @@ -332,6 +337,8 @@ def _extract_question_text(self, presentation_el): raw_text += mattext.tail # Clean CDATA whitespace while preserving HTML tags cleaned_text = self._clean_cdata_text(raw_text) + # Convert SCORM image file paths to base64 + cleaned_text = self._convert_scorm_images_to_base64(cleaned_text) text_parts.append(cleaned_text) return ''.join(text_parts) @@ -347,6 +354,7 @@ def _extract_text_from_feedback(self, feedback_el): """ Extract text from feedback element. Automatically cleans CDATA whitespace while preserving HTML tags. + Converts SCORM image file paths to base64 data URIs. """ material = feedback_el.find('material') if material is not None: @@ -354,7 +362,9 @@ def _extract_text_from_feedback(self, feedback_el): if mattext is not None: raw_text = mattext.text if mattext.text else '' # Clean CDATA whitespace while preserving HTML tags - return self._clean_cdata_text(raw_text) + cleaned_text = self._clean_cdata_text(raw_text) + # Convert SCORM image file paths to base64 + return self._convert_scorm_images_to_base64(cleaned_text) return None def _clean_cdata_text(self, text): @@ -393,6 +403,203 @@ def _clean_cdata_text(self, text): cleaned = re.sub(r'\s+', ' ', text).strip() return cleaned + def _convert_scorm_images_to_base64(self, html_text): + """ + Convert SCORM image file paths to base64 data URIs in HTML text. 
+ + SCORM packages store images as files (e.g., ./assessment-assets/.../image_1.png) + in the ZIP. This method extracts those images and converts them to base64 + data URIs so the JSON is self-contained. + + Args: + html_text: HTML text containing tags with file paths + + Returns: + str: HTML text with image file paths replaced with base64 data URIs + """ + if not html_text or not self.extracted_path: + return html_text + + # Find all img tags with src attributes + img_pattern = r']*?)src=["\']([^"\']+)["\']([^>]*?)>' + + def replace_image(match): + before_src = match.group(1) + img_src = match.group(2) + after_src = match.group(3) + + # Skip if already base64 or data URI + if img_src.startswith('data:') or 'base64' in img_src: + return match.group(0) + + # Skip if absolute URL + if img_src.startswith('http://') or img_src.startswith('https://'): + return match.group(0) + + try: + # Extract image path (remove leading ./ if present) + img_path = img_src.lstrip('./') + + # Try to find the image file in the extracted SCORM directory + # SCORM images are typically in assessment-assets folder + possible_paths = [ + path.join(self.extracted_path, img_path), + path.join(self.extracted_path, 'assessment-assets', path.basename(img_path)), + ] + + # Also try to find in any subdirectory + image_file = None + for possible_path in possible_paths: + if path.exists(possible_path) and path.isfile(possible_path): + image_file = possible_path + break + + # If not found, search recursively + if not image_file: + for root, dirs, files in os.walk(self.extracted_path): + if path.basename(img_path) in files: + image_file = path.join(root, path.basename(img_path)) + break + + if image_file and path.exists(image_file): + # Read image file and convert to base64 + with open(image_file, 'rb') as f: + image_data = f.read() + base64_data = base64.b64encode(image_data).decode('utf-8') + + # Determine MIME type from file extension + ext = path.splitext(image_file)[1].lower() + mime_types = { + 
'.png': 'image/png', + '.jpg': 'image/jpeg', + '.jpeg': 'image/jpeg', + '.gif': 'image/gif', + '.svg': 'image/svg+xml', + '.webp': 'image/webp' + } + mime_type = mime_types.get(ext, 'image/png') + + # Replace with base64 data URI + base64_src = f'data:{mime_type};base64,{base64_data}' + import logging + logger = logging.getLogger(__name__) + logger.info(f"Converted SCORM image {path.basename(image_file)} to base64 ({len(base64_data)} chars)") + return f'' + else: + # Image file not found, log warning + import logging + logger = logging.getLogger(__name__) + logger.warning(f"SCORM image not found: {img_src} (searched in {self.extracted_path})") + # Return original img tag (will show as broken image or alt text) + return match.group(0) + except Exception as e: + # If any error occurs, return original img tag + return match.group(0) + + # Replace all img tags + result = re.sub(img_pattern, replace_image, html_text) + return result + + def _convert_html_with_base64_images_to_markdown(self, html_text): + """ + Convert HTML text with base64 images to markdown format. + + Preserves ALL tags as HTML (Pandoc supports HTML in markdown). + Converts remaining HTML to plain text. 
+ + Args: + html_text: HTML text containing image tags (base64 or file paths) + + Returns: + str: Markdown formatted text with img tags preserved as HTML + """ + if not html_text: + return '' + + # Extract ALL img tags (both base64 and file paths) and preserve them as HTML + # Pattern to match any img tag + img_pattern = r']*?>' + + # Store HTML img tags temporarily with placeholders + html_images = {} + image_counter = 0 + + def preserve_img_tag(match): + nonlocal image_counter + # Preserve the entire img tag as HTML + full_img_tag = match.group(0) + placeholder = f'__HTML_IMAGE_{image_counter}__' + html_images[placeholder] = full_img_tag + image_counter += 1 + return placeholder + + # Preserve MathML blocks with placeholders so we can convert them to TeX + math_blocks = {} + math_counter = 0 + math_pattern = r'' + + def preserve_math(match): + nonlocal math_counter + full_math = match.group(0) + placeholder = f'__MATH_BLOCK_{math_counter}__' + # Try to extract TeX from annotation + tex_match = re.search( + r']*encoding=["\']application/x-tex["\'][^>]*>(.*?)', + full_math, + flags=re.IGNORECASE | re.DOTALL + ) + tex = tex_match.group(1) if tex_match else None + math_blocks[placeholder] = {"tex": tex, "raw": full_math} + math_counter += 1 + return placeholder + + # Replace all img tags and math blocks with placeholders + result = re.sub(img_pattern, preserve_img_tag, html_text) + result = re.sub(math_pattern, preserve_math, result, flags=re.IGNORECASE) + + # Convert remaining HTML to plain text using BeautifulSoup + try: + soup = BeautifulSoup(result, 'html.parser') + # Replace
with a placeholder so only those become hard breaks + for br in soup.find_all('br'): + br.replace_with('[[[BR]]]') + # Extract text while keeping other inline tags tight (no extra newlines) + text = soup.get_text(separator=' ', strip=False) + # Turn our placeholders into real newlines + text = text.replace('[[[BR]]]', '\n') + except Exception: + # Fallback: if BeautifulSoup fails, just clean up HTML tags manually + # But preserve placeholders + text = re.sub(r'<(?!/?__HTML_IMAGE_)[^>]+>', '', result) + + # Restore MathML (prefer TeX) and HTML img tags from placeholders + for placeholder, math_info in math_blocks.items(): + replacement = None + if math_info.get("tex"): + tex = math_info["tex"].strip() + replacement = f"$$ {tex} $$" + else: + replacement = math_info.get("raw", "") + text = text.replace(placeholder, replacement) + for placeholder, html_img in html_images.items(): + text = text.replace(placeholder, html_img) + + # Normalize whitespace on each line but keep explicit newlines and math/img blocks + text = text.replace('\r', '') + normalized_lines = [] + for line in text.split('\n'): + stripped = line.strip() + if stripped == '': + normalized_lines.append('') + continue + if re.search(r']*>', stripped, flags=re.IGNORECASE) or re.search(r']+>', '', plain_text) # Remove any remaining HTML tags + plain_text = re.sub(r'\s+', ' ', plain_text).strip() # Prefix with question number if available question_number = None @@ -1324,22 +1526,12 @@ def _format_question_to_markdown(self, question): # Add hint if present (format: @Hint: or @HINT:) if question.hint: - hint_text = question.hint - try: - soup = BeautifulSoup(hint_text, 'html.parser') - hint_text = soup.get_text(separator=' ', strip=True) - except: - hint_text = re.sub(r'\s+', ' ', hint_text).strip() + hint_text = self._convert_html_with_base64_images_to_markdown(question.hint) lines.append(f"@Hint: {hint_text}") # Add feedback if present (format: @Feedback: or @FEEDBACK:) if question.feedback: - feedback_text 
= question.feedback - try: - soup = BeautifulSoup(feedback_text, 'html.parser') - feedback_text = soup.get_text(separator=' ', strip=True) - except: - feedback_text = re.sub(r'\s+', ' ', feedback_text).strip() + feedback_text = self._convert_html_with_base64_images_to_markdown(question.feedback) lines.append(f"@Feedback: {feedback_text}") # Use double newlines so each logical line becomes a paragraph (hard breaks, not soft) @@ -1358,22 +1550,12 @@ def _format_multiple_choice_markdown(self, question): letter = chr(96 + idx) # a, b, c, etc. # Correct answer has * before the letter (weight > 0) marker = "*" if answer.weight and answer.weight > 0 else "" - # Clean HTML from answer text - answer_text = answer.answer - try: - soup = BeautifulSoup(answer_text, 'html.parser') - answer_text = soup.get_text(separator=' ', strip=True) - except: - answer_text = re.sub(r'\s+', ' ', answer_text).strip() + # Convert HTML with base64 images to markdown + answer_text = self._convert_html_with_base64_images_to_markdown(answer.answer) # Indent as level 2 list (4 spaces for markdown level 2) lines.append(f" {letter}. {marker}{answer_text}") if answer.answer_feedback: - feedback_text = answer.answer_feedback - try: - soup = BeautifulSoup(feedback_text, 'html.parser') - feedback_text = soup.get_text(separator=' ', strip=True) - except: - feedback_text = re.sub(r'\s+', ' ', feedback_text).strip() + feedback_text = self._convert_html_with_base64_images_to_markdown(answer.answer_feedback) lines.append(f" @Feedback: {feedback_text}") return "\n".join(lines) @@ -1390,10 +1572,12 @@ def _format_true_false_markdown(self, question): # Indent as level 2 list (4 spaces for markdown level 2) lines.append(f" a. {true_marker}True") if tf.true_feedback: - lines.append(f" @Feedback: {tf.true_feedback}") + feedback_text = self._convert_html_with_base64_images_to_markdown(tf.true_feedback) + lines.append(f" @Feedback: {feedback_text}") lines.append(f" b. 
{false_marker}False") if tf.false_feedback: - lines.append(f" @Feedback: {tf.false_feedback}") + feedback_text = self._convert_html_with_base64_images_to_markdown(tf.false_feedback) + lines.append(f" @Feedback: {feedback_text}") return "\n".join(lines) def _format_fib_markdown(self, question): @@ -1409,15 +1593,9 @@ def _format_fib_markdown(self, question): for fib in fibs: if fib.type == 'fibquestion': if fib.text: - # Clean HTML tags but preserve spacing - from bs4 import BeautifulSoup - try: - soup = BeautifulSoup(fib.text, 'html.parser') - cleaned_text = soup.get_text(separator=' ', strip=False) - current_text += cleaned_text - except Exception: - # Fallback: use text as-is if BeautifulSoup fails - current_text += fib.text + # Convert HTML with base64 images to markdown, preserving spacing + cleaned_text = self._convert_html_with_base64_images_to_markdown(fib.text) + current_text += cleaned_text elif fib.type == 'fibanswer': # Insert answer in brackets [answer] where the blank should be if fib.text: @@ -1440,22 +1618,12 @@ def _format_multi_select_markdown(self, question): for idx, answer in enumerate(answers, start=1): letter = chr(96 + idx) # a, b, c, etc. marker = "*" if answer.is_correct else "" - # Clean HTML from answer text - answer_text = answer.answer - try: - soup = BeautifulSoup(answer_text, 'html.parser') - answer_text = soup.get_text(separator=' ', strip=True) - except: - answer_text = re.sub(r'\s+', ' ', answer_text).strip() + # Convert HTML with base64 images to markdown + answer_text = self._convert_html_with_base64_images_to_markdown(answer.answer) # Indent as level 2 list (4 spaces for markdown level 2) lines.append(f" {letter}. 
{marker}{answer_text}") if answer.answer_feedback: - feedback_text = answer.answer_feedback - try: - soup = BeautifulSoup(feedback_text, 'html.parser') - feedback_text = soup.get_text(separator=' ', strip=True) - except: - feedback_text = re.sub(r'\s+', ' ', feedback_text).strip() + feedback_text = self._convert_html_with_base64_images_to_markdown(answer.answer_feedback) lines.append(f" @Feedback: {feedback_text}") return "\n".join(lines) @@ -1472,15 +1640,15 @@ def _format_matching_markdown(self, question): for idx, choice in enumerate(choices, start=1): letter = chr(96 + idx) # a, b, c, etc. - # Remove block-level HTML tags but preserve inline styling - choice_text = self._remove_block_tags_preserve_inline(choice.choice_text) + # Convert HTML with base64 images to markdown (preserves inline styling and images) + choice_text = self._convert_html_with_base64_images_to_markdown(choice.choice_text) # Use the related manager matching_answers (from ForeignKey in MatchingAnswer) answers = choice.matching_answers.all() if answers: # Get the first matching answer (typically there's one per choice) answer = answers[0] - answer_text = self._remove_block_tags_preserve_inline(answer.answer_text) + answer_text = self._convert_html_with_base64_images_to_markdown(answer.answer_text) # Indent as level 2 list (4 spaces for markdown level 2) lines.append(f" {letter}. {choice_text} = {answer_text}") else: @@ -1531,26 +1699,12 @@ def _format_ordering_markdown(self, question): orderings = question.get_orderings() for idx, ordering in enumerate(orderings, start=1): letter = chr(96 + idx) # a, b, c, etc. 
- # Clean HTML from ordering text - ordering_text = ordering.text - try: - from bs4 import BeautifulSoup - soup = BeautifulSoup(ordering_text, 'html.parser') - ordering_text = soup.get_text(separator=' ', strip=True) - except: - import re - ordering_text = re.sub(r'\s+', ' ', ordering_text).strip() + # Convert HTML with base64 images to markdown + ordering_text = self._convert_html_with_base64_images_to_markdown(ordering.text) # Indent as level 2 list (4 spaces for markdown level 2) lines.append(f" {letter}. {ordering_text}") if ordering.ord_feedback: - feedback_text = ordering.ord_feedback - try: - from bs4 import BeautifulSoup - soup = BeautifulSoup(feedback_text, 'html.parser') - feedback_text = soup.get_text(separator=' ', strip=True) - except: - import re - feedback_text = re.sub(r'\s+', ' ', feedback_text).strip() + feedback_text = self._convert_html_with_base64_images_to_markdown(ordering.ord_feedback) lines.append(f" @Feedback: {feedback_text}") return "\n".join(lines) @@ -1565,15 +1719,8 @@ def _format_written_response_markdown(self, question): if wr and wr.answer_key: # Add blank line first (double newline for hard paragraph break) lines.append("") - # Clean HTML from answer text - answer_text = wr.answer_key - try: - from bs4 import BeautifulSoup - soup = BeautifulSoup(answer_text, 'html.parser') - answer_text = soup.get_text(separator=' ', strip=True) - except: - import re - answer_text = re.sub(r'\s+', ' ', answer_text).strip() + # Convert HTML with base64 images to markdown + answer_text = self._convert_html_with_base64_images_to_markdown(wr.answer_key) # Indent with regular spaces (3 for label, 7 for answer) to mimic margin # Avoid 4+ leading spaces to prevent markdown list or code block detection lines.append(f"Correct Answer:") diff --git a/api/tasks.py b/api/tasks.py index e02b2d5..abe8063 100644 --- a/api/tasks.py +++ b/api/tasks.py @@ -712,6 +712,7 @@ def run_pandoc_task(questionlibrary_id): emptyparaPath = 
"./pandoc/pandoc-filters/emptypara.lua" imageFilterPath = "./pandoc/pandoc-filters/image.lua" tables = "./pandoc/pandoc-filters/tables.lua" + linebreakPath = "./pandoc/pandoc-filters/linebreak.lua" # listsPath = "./api/pandoc/pandoc-filters/lists.lua" pandoc_word_to_html = pypandoc.convert_file( @@ -744,6 +745,7 @@ def run_pandoc_task(questionlibrary_id): '--ascii', '--lua-filter=' + mdblockquotePath, '--lua-filter=' + emptyparaPath, + '--lua-filter=' + linebreakPath, # '--lua-filter=' + tables ]) pandoc_html_to_md = pandoc_html_to_md.rstrip() diff --git a/api/views.py b/api/views.py index 5007e8a..e4bc244 100644 --- a/api/views.py +++ b/api/views.py @@ -246,6 +246,9 @@ def post(self, request, format=None): if serializer.is_valid(): instance = serializer.save() + logger.addFilter(QuestionlibraryFilenameFilter(instance)) + logger.info(f"[{instance.id}] SCORM to JSON conversion started") + try: # Step 1: Extract SCORM ZIP and parse XML using XmlReader from .scorm.XmlReader import XmlReader @@ -318,6 +321,9 @@ def post(self, request, format=None): ql_instance.create_directory() ql_instance.save() + logger.addFilter(QuestionlibraryFilenameFilter(ql_instance)) + logger.info(f"[{ql_instance.id}] JSON to DOCX conversion started") + try: # Step 1: Convert Django models to markdown (matching formatter_output format) from .scorm.XmlReader import XmlReader @@ -330,6 +336,67 @@ def post(self, request, format=None): xml_reader = object.__new__(XmlReader) # Create instance without calling __init__ markdown_text = xml_reader.format_to_markdown(ql_instance) + # Extract base64 images from HTML img tags and save as files + # Pandoc doesn't support base64 data URIs when converting markdown to DOCX + # So we need to extract them to files and use file references + import base64 + import uuid + import os + import re as re_module + image_counter = 0 + base64_pattern = r']*?)src=["\'](data:image/([^;]+);base64,([^"\']+))["\']([^>]*?)>' + + def replace_base64_with_file(match): + nonlocal 
image_counter + before_src = match.group(1) + full_data_uri = match.group(2) + image_type = match.group(3) # png, jpeg, etc. + base64_data = match.group(4) + after_src = match.group(5) + + try: + # Decode base64 image + image_data = base64.b64decode(base64_data) + + # Determine file extension from MIME type + ext_map = { + 'png': 'png', + 'jpeg': 'jpg', + 'jpg': 'jpg', + 'gif': 'gif', + 'svg+xml': 'svg', + 'webp': 'webp' + } + ext = ext_map.get(image_type.lower(), 'png') + + # Save image to temporary file + image_filename = f"image_{image_counter}_{uuid.uuid4().hex[:8]}.{ext}" + image_path = path.join(ql_instance.folder_path, image_filename) + + with open(image_path, 'wb') as img_file: + img_file.write(image_data) + + image_counter += 1 + logger.info(f"Extracted base64 image to file: {image_filename} ({len(image_data)} bytes)") + + # Extract alt text if present + alt_match = re.search(r'alt=["\']([^"\']*)["\']', before_src + after_src) + alt_text = alt_match.group(1) if alt_match else 'image' + + # Use markdown image syntax with relative path (filename only) + # We'll change working directory to folder_path before Pandoc conversion + markdown_image = f'![{alt_text}]({image_filename})' + logger.debug(f"Replacing base64 img tag with markdown: {markdown_image}") + return markdown_image + except Exception as e: + logger.error(f"Error extracting base64 image: {str(e)}") + # Return original if extraction fails + return match.group(0) + + # Replace all base64 img tags with file references + markdown_text = re.sub(base64_pattern, replace_base64_with_file, markdown_text) + logger.info(f"Extracted {image_counter} base64 images to files") + # Step 2: Convert markdown to DOCX using Pandoc (reverse of run_pandoc_task) # Use main_title if it exists, otherwise use filtered_main_title if ql_instance.main_title: @@ -348,55 +415,121 @@ def post(self, request, format=None): # Convert markdown to DOCX # Use similar settings as the forward conversion but in reverse - mdblockquotePath = 
"./pandoc/pandoc-filters/mdblockquote.lua" - emptyparaPath = "./pandoc/pandoc-filters/emptypara.lua" + # Get absolute paths for lua filters before changing directory + import os as os_module + # Calculate base directory (project root) - views.py is in api/, so go up one level + current_file_dir = os_module.path.dirname(os_module.path.abspath(__file__)) # /code/api + base_dir = os_module.path.dirname(current_file_dir) # /code + mdblockquotePath = os_module.path.join(base_dir, "pandoc", "pandoc-filters", "mdblockquote.lua") + emptyparaPath = os_module.path.join(base_dir, "pandoc", "pandoc-filters", "emptypara.lua") + # Make paths absolute + mdblockquotePath = os_module.path.abspath(mdblockquotePath) + emptyparaPath = os_module.path.abspath(emptyparaPath) + logger.debug(f"Lua filter paths: mdblockquote={mdblockquotePath}, emptypara={emptyparaPath}") # Create temporary markdown file temp_md_path = path.join(ql_instance.folder_path, "temp_markdown.md") with open(temp_md_path, 'w', encoding='utf-8') as f: f.write(markdown_text) + # Log markdown preview and verify image file references + # Check for image file references in markdown + import re as re_module + import glob + # Check for markdown image syntax with file references + file_refs = re_module.findall(r'!\[.*?\]\((image_\d+_[^)]+)\)', markdown_text) + logger.info(f"Found {len(file_refs)} image file references in markdown") + # List image files in the folder and their sizes + image_files = glob.glob(path.join(ql_instance.folder_path, "image_*.*")) + image_info = [] + total_image_size = 0 + for img_file in image_files: + if path.exists(img_file): + img_size = path.getsize(img_file) + total_image_size += img_size + img_size_mb = img_size / (1024 * 1024) + image_info.append(f"{path.basename(img_file)} ({img_size_mb:.2f} MB, {img_size} bytes)") + if len(image_files) > 0: + logger.info(f"Found {len(image_files)} image files in folder:") + for info in image_info: + logger.info(f" - {info}") + logger.info(f"Total image 
size: {total_image_size / (1024 * 1024):.2f} MB ({total_image_size} bytes)") + logger.info(f"Markdown file created at: {temp_md_path}") + try: - # Convert markdown to DOCX (reverse of DOCX → markdown) - # First convert markdown to HTML (intermediate step like forward conversion) - pandoc_md_to_html = pypandoc.convert_file( - temp_md_path, - format='markdown_github+fancy_lists+emoji+hard_line_breaks+all_symbols_escapable+escaped_line_breaks+pipe_tables+startnum+tex_math_dollars', - to='html+empty_paragraphs+tex_math_single_backslash', - extra_args=[ - '--no-highlight', - '--embed-resources', - '--markdown-headings=atx', - '--preserve-tabs', - '--wrap=preserve', - '--indent=false', - '--mathml', - '--ascii', - '--lua-filter=' + mdblockquotePath, - '--lua-filter=' + emptyparaPath, + # Convert markdown directly to DOCX + # Images are now file references, so Pandoc should be able to find and embed them + original_cwd = os_module.getcwd() + try: + os_module.chdir(ql_instance.folder_path) + # Use relative path since we changed directory + temp_md_rel_path = "temp_markdown.md" + docx_output_name = os_module.path.basename(docx_path) + logger.info(f"Converting markdown with image file references to DOCX (working dir: {os_module.getcwd()})") + # Verify images exist before conversion + import glob as glob_module + existing_images = glob_module.glob("image_*.*") + logger.info(f"Images in working directory before Pandoc: {existing_images}") + # Verify markdown has image references + with open(temp_md_rel_path, 'r', encoding='utf-8') as f: + md_content = f.read() + image_refs_in_md = re_module.findall(r'!\[.*?\]\((image_\d+_[^)]+)\)', md_content) + logger.info(f"Image references found in markdown file: {image_refs_in_md}") + # Call pandoc directly via subprocess to capture warnings/errors + import subprocess + pandoc_cmd = [ + "pandoc", + temp_md_rel_path, + "-f", + 
"markdown_github+fancy_lists+emoji+hard_line_breaks+all_symbols_escapable+escaped_line_breaks+pipe_tables+startnum+tex_math_dollars", + "-t", + "docx+empty_paragraphs", + "-o", + docx_output_name, + "--no-highlight", + "--preserve-tabs", + "--wrap=preserve", + "--indent=false", + "--mathml", + "--ascii", + "--lua-filter=" + mdblockquotePath, + "--lua-filter=" + emptyparaPath, ] - ) - - # Then convert HTML to DOCX - pypandoc.convert_text( - pandoc_md_to_html, - format='html+empty_paragraphs', - to='docx+empty_paragraphs', - outputfile=docx_path, - extra_args=[ - '--no-highlight', - '--preserve-tabs', - '--wrap=preserve', - '--indent=false', - '--mathml', - '--ascii', - ] - ) + logger.info(f"Running pandoc command: {' '.join(pandoc_cmd)}") + result = subprocess.run( + pandoc_cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + if result.returncode != 0: + logger.error(f"Pandoc failed (exit {result.returncode}): {result.stderr}") + raise Exception(f"Pandoc failed: {result.stderr}") + if result.stderr: + logger.warning(f"Pandoc warnings: {result.stderr}") + logger.info(f"Pandoc markdown to DOCX conversion completed") + finally: + os_module.chdir(original_cwd) finally: # Clean up temporary markdown file if path.exists(temp_md_path): from os import remove remove(temp_md_path) + + # Clean up temporary image files + import glob + image_files = glob.glob(path.join(ql_instance.folder_path, "image_*.png")) + \ + glob.glob(path.join(ql_instance.folder_path, "image_*.jpg")) + \ + glob.glob(path.join(ql_instance.folder_path, "image_*.jpeg")) + \ + glob.glob(path.join(ql_instance.folder_path, "image_*.gif")) + \ + glob.glob(path.join(ql_instance.folder_path, "image_*.svg")) + \ + glob.glob(path.join(ql_instance.folder_path, "image_*.webp")) + for img_file in image_files: + try: + if path.exists(img_file): + remove(img_file) + except Exception as e: + logger.warning(f"Could not remove temporary image file {img_file}: {str(e)}") # Step 3: Return DOCX file 
from django.core.files import File @@ -406,8 +539,11 @@ def post(self, request, format=None): file_response = FileResponse(ql_instance.temp_file) file_response['Content-Disposition'] = f'attachment; filename="{docx_filename}"' + # Log DOCX file size + docx_size_bytes = path.getsize(docx_path) + docx_size_mb = docx_size_bytes / (1024 * 1024) logger.addFilter(QuestionlibraryFilenameFilter(ql_instance)) - logger.info(f"[{ql_instance.id}] JSON to DOCX conversion completed") + logger.info(f"[{ql_instance.id}] JSON to DOCX conversion completed - DOCX size: {docx_size_mb:.2f} MB ({docx_size_bytes} bytes)") ql_instance.cleanup() diff --git a/pandoc/pandoc-filters/linebreak.lua b/pandoc/pandoc-filters/linebreak.lua new file mode 100644 index 0000000..c9455a5 --- /dev/null +++ b/pandoc/pandoc-filters/linebreak.lua @@ -0,0 +1,11 @@ +-- Replace HTML
tags with proper pandoc LineBreak nodes +return { + RawInline = function(el) + if el.format:match("html") then + local text = el.text:lower() + if text == "
" or text == "
" or text == "
" then + return pandoc.LineBreak() + end + end + end +} diff --git a/restapi/tasks.py b/restapi/tasks.py index 9234cad..09a6545 100644 --- a/restapi/tasks.py +++ b/restapi/tasks.py @@ -20,6 +20,7 @@ def run_pandoc_task(temp_file_path, filename): emptyparaPath = "./pandoc/pandoc-filters/emptypara.lua" imageFilterPath = "./pandoc/pandoc-filters/image.lua" tables = "./pandoc/pandoc-filters/tables.lua" + linebreakPath = "./pandoc/pandoc-filters/linebreak.lua" # listsPath = "./api/pandoc/pandoc-filters/lists.lua" pandoc_word_to_html = pypandoc.convert_file( @@ -52,6 +53,7 @@ def run_pandoc_task(temp_file_path, filename): '--ascii', '--lua-filter=' + mdblockquotePath, '--lua-filter=' + emptyparaPath, + '--lua-filter=' + linebreakPath, # '--lua-filter=' + tables ]) pandoc_html_to_md = pandoc_html_to_md.rstrip() From 007ef28379db0768aebf5566aed17917d3572044 Mon Sep 17 00:00:00 2001 From: Arvin Rolos Date: Mon, 19 Jan 2026 15:21:00 -0800 Subject: [PATCH 4/8] fix section delimiter --- api/scorm/XmlReader.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/api/scorm/XmlReader.py b/api/scorm/XmlReader.py index 11fbc9e..f4ae1b5 100644 --- a/api/scorm/XmlReader.py +++ b/api/scorm/XmlReader.py @@ -1402,8 +1402,8 @@ def format_to_markdown(self, question_library): for section in sections: # Skip root section (is_main_content=True) - don't wrap it with #section markers # Only wrap non-root sections with #section and /section markers - if not section.is_main_content: - lines.append("#section") + # if not section.is_main_content: + # lines.append("#section") # Add section title if present and should be displayed (## for markdown heading) if section.title and section.is_title_displayed: @@ -1414,6 +1414,10 @@ def format_to_markdown(self, question_library): section_title_display = soup.get_text(separator=' ', strip=True) except: section_title_display = re.sub(r'\s+', ' ', section_title_display).strip() + lines.append("") # Add blank lines 
before #section + lines.append("") + lines.append("#section") + lines.append("") # Add blank line after #section lines.append(f"## {section_title_display}") # Add section text if present and should be displayed @@ -1424,15 +1428,24 @@ def format_to_markdown(self, question_library): # Process questions in this section questions = section.get_questions() - for question in questions: + for idx, question in enumerate(questions): question_markdown = self._format_question_to_markdown(question) lines.append(question_markdown) - lines.append("") # Add blank line between questions + + # Add /section marker after the last question for non-root sections + if not section.is_main_content and idx == len(questions) - 1: + # Last question - add /section right after it + lines.append("") + lines.append("/section") + lines.append("") + elif idx < len(questions) - 1: + # Not the last question - add blank line between questions + lines.append("") - # Close section marker for non-root sections - if not section.is_main_content: + # If section has no questions, still add /section for non-root sections + if not section.is_main_content and len(questions) == 0: lines.append("/section") - lines.append("") # Add blank line after section + lines.append("") # Join with newlines and ensure proper formatting result = "\n".join(lines) From ab7d6700e01b8b55d134e395c3111864342fd525 Mon Sep 17 00:00:00 2001 From: Arvin Rolos Date: Tue, 20 Jan 2026 14:59:28 -0800 Subject: [PATCH 5/8] fix emoji, symbols, and add section text --- api/scorm/XmlReader.py | 109 +++++++++++++++++++++++++++-------------- 1 file changed, 72 insertions(+), 37 deletions(-) diff --git a/api/scorm/XmlReader.py b/api/scorm/XmlReader.py index f4ae1b5..bd2a44f 100644 --- a/api/scorm/XmlReader.py +++ b/api/scorm/XmlReader.py @@ -10,6 +10,7 @@ from bs4 import BeautifulSoup import re import base64 +import html from api.models import ( QuestionLibrary, Section, Question, MultipleChoice, MultipleChoiceAnswer, @@ -294,6 +295,7 @@ def 
_extract_text_from_material(self, material_el): Extract text content from material element, handling CDATA. Automatically cleans CDATA whitespace and HTML tags. Converts SCORM image file paths to base64 data URIs. + Decodes HTML entities (including numeric entities for emojis and symbols). """ text_parts = [] @@ -309,8 +311,10 @@ def _extract_text_from_material(self, material_el): # Also check for CDATA in tail if mattext.tail: raw_text += mattext.tail + # Decode HTML entities (handles &, <, >, 🤣, etc.) + decoded_text = html.unescape(raw_text) # Clean CDATA whitespace while preserving HTML tags - cleaned_text = self._clean_cdata_text(raw_text) + cleaned_text = self._clean_cdata_text(decoded_text) # Convert SCORM image file paths to base64 cleaned_text = self._convert_scorm_images_to_base64(cleaned_text) text_parts.append(cleaned_text) @@ -322,6 +326,7 @@ def _extract_question_text(self, presentation_el): Extract question text from presentation element. Automatically cleans CDATA whitespace and HTML tags. Converts SCORM image file paths to base64 data URIs. + Decodes HTML entities (including numeric entities for emojis and symbols). """ text_parts = [] @@ -335,8 +340,10 @@ def _extract_question_text(self, presentation_el): raw_text = mattext.text if mattext.text else '' if mattext.tail: raw_text += mattext.tail + # Decode HTML entities (handles &, <, >, 🤣, etc.) + decoded_text = html.unescape(raw_text) # Clean CDATA whitespace while preserving HTML tags - cleaned_text = self._clean_cdata_text(raw_text) + cleaned_text = self._clean_cdata_text(decoded_text) # Convert SCORM image file paths to base64 cleaned_text = self._convert_scorm_images_to_base64(cleaned_text) text_parts.append(cleaned_text) @@ -355,14 +362,17 @@ def _extract_text_from_feedback(self, feedback_el): Extract text from feedback element. Automatically cleans CDATA whitespace while preserving HTML tags. Converts SCORM image file paths to base64 data URIs. 
+ Decodes HTML entities (including numeric entities for emojis and symbols). """ material = feedback_el.find('material') if material is not None: mattext = material.find('mattext') if mattext is not None: raw_text = mattext.text if mattext.text else '' + # Decode HTML entities (handles &, <, >, 🤣, etc.) + decoded_text = html.unescape(raw_text) # Clean CDATA whitespace while preserving HTML tags - cleaned_text = self._clean_cdata_text(raw_text) + cleaned_text = self._clean_cdata_text(decoded_text) # Convert SCORM image file paths to base64 return self._convert_scorm_images_to_base64(cleaned_text) return None @@ -557,6 +567,11 @@ def preserve_math(match): result = re.sub(img_pattern, preserve_img_tag, html_text) result = re.sub(math_pattern, preserve_math, result, flags=re.IGNORECASE) + # Replace

and

tags with newlines to preserve paragraph breaks + # Do this before BeautifulSoup processes it to ensure paragraph breaks are preserved + result = re.sub(r'

', '\n', result, flags=re.IGNORECASE) + result = re.sub(r']*>', '\n', result, flags=re.IGNORECASE) + # Convert remaining HTML to plain text using BeautifulSoup try: soup = BeautifulSoup(result, 'html.parser') @@ -586,6 +601,9 @@ def preserve_math(match): # Normalize whitespace on each line but keep explicit newlines and math/img blocks text = text.replace('\r', '') + # Normalize multiple consecutive newlines (from paragraph breaks) to double newlines + # This ensures paragraphs are separated by blank lines in markdown + text = re.sub(r'\n{3,}', '\n\n', text) # 3+ newlines -> 2 newlines normalized_lines = [] for line in text.split('\n'): stripped = line.strip() @@ -649,8 +667,10 @@ def _parse_multiple_choice(self, item_el, question_ident): answer_text = '' if mattext is not None: raw_text = mattext.text if mattext.text else '' + # Decode HTML entities (handles &, <, >, 🤣, etc.) + decoded_text = html.unescape(raw_text) # Clean CDATA whitespace while preserving HTML tags - answer_text = self._clean_cdata_text(raw_text) + answer_text = self._clean_cdata_text(decoded_text) # Find weight from resprocessing weight = 0.0 @@ -806,8 +826,10 @@ def _parse_fill_in_the_blanks(self, item_el, question_ident): mattext = child.find('mattext') text = '' if mattext is not None: - # Don't clean CDATA for FIB - preserve original spacing - text = mattext.text if mattext.text else '' + raw_text = mattext.text if mattext.text else '' + # Decode HTML entities (handles &, <, >, 🤣, etc.) + # Don't clean CDATA for FIB - preserve original spacing, but decode entities + text = html.unescape(raw_text) fib_data['fibs'].append({ 'type': 'fibquestion', @@ -902,8 +924,10 @@ def _parse_multi_select(self, item_el, question_ident): answer_text = '' if mattext is not None: raw_text = mattext.text if mattext.text else '' + # Decode HTML entities (handles &, <, >, 🤣, etc.) 
+ decoded_text = html.unescape(raw_text) # Clean CDATA whitespace while preserving HTML tags - answer_text = self._clean_cdata_text(raw_text) + answer_text = self._clean_cdata_text(decoded_text) # Determine if correct from resprocessing is_correct = False @@ -1000,8 +1024,10 @@ def _parse_matching(self, item_el, question_ident): mattext = material.find('mattext') if mattext is not None: raw_text = mattext.text if mattext.text else '' + # Decode HTML entities (handles &, <, >, 🤣, etc.) + decoded_text = html.unescape(raw_text) # Clean CDATA whitespace while preserving HTML tags - choice_text = self._clean_cdata_text(raw_text) + choice_text = self._clean_cdata_text(decoded_text) # Find correct answer from resprocessing correct_answer_ident = None @@ -1068,8 +1094,10 @@ def _parse_ordering(self, item_el, question_ident): text = '' if mattext is not None: raw_text = mattext.text if mattext.text else '' + # Decode HTML entities (handles &, <, >, 🤣, etc.) + decoded_text = html.unescape(raw_text) # Clean CDATA whitespace while preserving HTML tags - text = self._clean_cdata_text(raw_text) + text = self._clean_cdata_text(decoded_text) # Find feedback ord_feedback = None @@ -1131,8 +1159,10 @@ def _parse_written_response(self, item_el, question_ident): mattext = initial_text_mat.find('.//mattext') if mattext is not None: raw_text = mattext.text if mattext.text else '' + # Decode HTML entities (handles &, <, >, 🤣, etc.) 
+ decoded_text = html.unescape(raw_text) # Clean CDATA whitespace while preserving HTML tags - cleaned_text = self._clean_cdata_text(raw_text) + cleaned_text = self._clean_cdata_text(decoded_text) wr_data['initial_text'] = cleaned_text if cleaned_text else None return wr_data @@ -1400,28 +1430,33 @@ def format_to_markdown(self, question_library): # Process sections sections = question_library.get_sections() for section in sections: - # Skip root section (is_main_content=True) - don't wrap it with #section markers - # Only wrap non-root sections with #section and /section markers - # if not section.is_main_content: - # lines.append("#section") + # For main content sections (is_main_content=True), skip #section markers and section title + # The main title is already displayed as H1 above + if not section.is_main_content: + # Add section title if present and should be displayed (## for markdown heading) + if section.title and section.is_title_displayed: + # Clean HTML from section title for display + section_title_display = section.title + try: + soup = BeautifulSoup(section_title_display, 'html.parser') + section_title_display = soup.get_text(separator=' ', strip=True) + except: + section_title_display = re.sub(r'\s+', ' ', section_title_display).strip() + lines.append("") + lines.append("
") + lines.append("#section") + lines.append(f"## {section_title_display}") - # Add section title if present and should be displayed (## for markdown heading) - if section.title and section.is_title_displayed: - # Clean HTML from section title for display - section_title_display = section.title - try: - soup = BeautifulSoup(section_title_display, 'html.parser') - section_title_display = soup.get_text(separator=' ', strip=True) - except: - section_title_display = re.sub(r'\s+', ' ', section_title_display).strip() - lines.append("") # Add blank lines before #section - lines.append("") - lines.append("#section") - lines.append("") # Add blank line after #section - lines.append(f"## {section_title_display}") + # Add section text if present + # For main content sections: only display if is_text_displayed is true + # For non-main-content sections: always display if text exists (regardless of is_text_displayed) + should_display_text = False + if section.is_main_content: + should_display_text = section.text and section.is_text_displayed + else: + should_display_text = bool(section.text) # Display if text exists - # Add section text if present and should be displayed - if section.text and section.is_text_displayed: + if should_display_text: # Convert HTML with base64 images to markdown section_text = self._convert_html_with_base64_images_to_markdown(section.text) lines.append(section_text) @@ -1432,20 +1467,18 @@ def format_to_markdown(self, question_library): question_markdown = self._format_question_to_markdown(question) lines.append(question_markdown) - # Add /section marker after the last question for non-root sections + # Add /section marker after the last question for non-main-content sections if not section.is_main_content and idx == len(questions) - 1: # Last question - add /section right after it lines.append("") + lines.append("
") lines.append("/section") - lines.append("") - elif idx < len(questions) - 1: - # Not the last question - add blank line between questions - lines.append("") - # If section has no questions, still add /section for non-root sections + # If section has no questions, still add /section for non-main-content sections if not section.is_main_content and len(questions) == 0: - lines.append("/section") lines.append("") + lines.append("
") + lines.append("/section") # Join with newlines and ensure proper formatting result = "\n".join(lines) @@ -1464,6 +1497,8 @@ def _format_question_to_markdown(self, question): # Question header: Type, Title, Points (each on separate line) # Each header on its own line if question.questiontype: + lines.append("") + lines.append("
") lines.append(f"Type: {question.questiontype}") if question.title: lines.append(f"Title: {question.title}") From a3a2eeb7b146b34933aee9d03a802471be1c941a Mon Sep 17 00:00:00 2001 From: Arvin Rolos Date: Wed, 21 Jan 2026 09:22:49 -0800 Subject: [PATCH 6/8] add root section main_text --- api/models.py | 1 + api/process/formatter.py | 29 ++++++++++++++++++++++++++--- api/process/sectioner.py | 3 +++ api/scorm/XmlReader.py | 19 ++++++++++++++++++- api/serializers.py | 4 ++-- 5 files changed, 50 insertions(+), 6 deletions(-) diff --git a/api/models.py b/api/models.py index 2b41e69..8e4ee64 100644 --- a/api/models.py +++ b/api/models.py @@ -49,6 +49,7 @@ class QuestionLibrary(models.Model): image_path = models.FilePathField(path=None, match=None, recursive=False, max_length=None) shuffle = models.BooleanField(blank=True, null=True) main_title = models.TextField(blank=True, null=True) + main_text = models.TextField(blank=True, null=True) filtered_main_title = models.TextField(blank=True, null=True) end_answers_raw = models.TextField(blank=True, null=True) formatter_error = models.TextField(blank=True, null=True) diff --git a/api/process/formatter.py b/api/process/formatter.py index 5a1af05..a451d52 100644 --- a/api/process/formatter.py +++ b/api/process/formatter.py @@ -32,9 +32,32 @@ def run_formatter(questionlibrary): maincontenttitle = root.find('maincontent_title') logger.debug("checking maincontent title") if maincontenttitle is not None: - main_title = (maincontenttitle.text).strip() - if main_title: - questionlibrary.main_title = (trim_text(main_title)).lstrip('# ') + raw_main = (maincontenttitle.text or "").strip() + if raw_main: + # Use the first H1 line as the title; remaining lines become root-level text + main_lines = raw_main.splitlines() + title_index = None + for idx, line in enumerate(main_lines): + if line.lstrip().startswith('#'): + title_index = idx + break + + if title_index is not None: + main_title = main_lines[title_index].strip() + main_title = 
(trim_text(main_title)).lstrip('# ').strip() + main_text_lines = main_lines[title_index + 1:] + else: + # Fallback: treat the first line as title if no H1 is found + main_title = (trim_text(main_lines[0])).lstrip('# ').strip() + main_text_lines = main_lines[1:] + + main_text = "\n".join(main_text_lines).strip() + + if main_title: + questionlibrary.main_title = main_title + if main_text: + # Preserve raw markdown for root-level text + questionlibrary.main_text = main_text questionlibrary.save() # ==================================== BODY diff --git a/api/process/sectioner.py b/api/process/sectioner.py index 05bbb84..0856d3b 100644 --- a/api/process/sectioner.py +++ b/api/process/sectioner.py @@ -60,6 +60,9 @@ def run_sectioner(questionlibrary): sectionobject.raw_content = maincontent.text sectionobject.is_main_content = True sectionobject.title = questionlibrary.main_title + if questionlibrary.main_text: + sectionobject.text = markdown_to_html(questionlibrary.main_text) + sectionobject.is_text_displayed = True sectiontext = section.find('sectiontext') if sectiontext is not None: diff --git a/api/scorm/XmlReader.py b/api/scorm/XmlReader.py index bd2a44f..c5b1096 100644 --- a/api/scorm/XmlReader.py +++ b/api/scorm/XmlReader.py @@ -1426,6 +1426,11 @@ def format_to_markdown(self, question_library): main_title = re.sub(r'\s+', ' ', main_title).strip() lines.append(f"# {main_title}") lines.append("") # Add blank line after title + + # Add root-level text (main_text) if present + if getattr(question_library, "main_text", None): + lines.append(question_library.main_text) + lines.append("") # Process sections sections = question_library.get_sections() @@ -1494,7 +1499,7 @@ def _format_question_to_markdown(self, question): """ lines = [] - # Question header: Type, Title, Points (each on separate line) + # Question header: Type, Title, Points, Randomize (each on separate line) # Each header on its own line if question.questiontype: lines.append("") @@ -1506,6 +1511,18 @@ def 
_format_question_to_markdown(self, question): # Normalize points: remove trailing zeros and decimal if not needed (e.g., 1.0000 -> 1, 1.5 -> 1.5) normalized_points = str(float(question.points)).rstrip('0').rstrip('.') lines.append(f"Points: {normalized_points}") + # Add Randomize if set on MC/MS question types (mirrors docx -> json randomize parsing) + randomize_value = None + if question.questiontype == 'MC': + mc = question.get_multiple_choice() + if mc and mc.randomize is not None: + randomize_value = mc.randomize + elif question.questiontype == 'MS': + ms = question.get_multiple_select() + if ms and ms.randomize is not None: + randomize_value = ms.randomize + if randomize_value is True: + lines.append("Randomize: yes") # Add question text (HTML format from SCORM, convert to markdown preserving base64 images) # Prefix with question number if available (e.g., "1. Question text") diff --git a/api/serializers.py b/api/serializers.py index acea5aa..5e11dcd 100644 --- a/api/serializers.py +++ b/api/serializers.py @@ -286,7 +286,7 @@ def get_sections(self, questionlibrary): return serializer.data class Meta: model = QuestionLibrary - fields = ['main_title', 'randomize_answer', 'enumeration', 'media_folder', 'sections'] + fields = ['main_title', 'main_text', 'randomize_answer', 'enumeration', 'media_folder', 'sections'] ############################## `/package` serializers ############################## @@ -343,7 +343,7 @@ class QuestionLibraryPackageSerializer(serializers.ModelSerializer): class Meta: model = QuestionLibrary - fields = ['main_title', 'randomize_answer', 'enumeration', 'media_folder', 'formatter_output', 'sectioner_output', 'sections'] + fields = ['main_title', 'main_text', 'randomize_answer', 'enumeration', 'media_folder', 'formatter_output', 'sectioner_output', 'sections'] def create(self, validated_data): sections_data = validated_data.pop('sections') From fb7ca1df69c9c775192a71fec95c86873f7e9946 Mon Sep 17 00:00:00 2001 From: Arvin Rolos Date: Thu, 
22 Jan 2026 08:42:17 -0800 Subject: [PATCH 7/8] main text and renames --- api/scorm/XmlReader.py | 94 +++++++++++++++++++++--------------------- api/scorm/XmlWriter.py | 2 +- api/serializers.py | 14 +++++++ 3 files changed, 63 insertions(+), 47 deletions(-) diff --git a/api/scorm/XmlReader.py b/api/scorm/XmlReader.py index c5b1096..12e2516 100644 --- a/api/scorm/XmlReader.py +++ b/api/scorm/XmlReader.py @@ -177,7 +177,7 @@ def _parse_section(self, section_el): # Parse presentation material (section text) presentation_material = section_el.find('presentation_material') if presentation_material is not None: - text = self._extract_text_from_material(presentation_material) + text = self._extract_material_text(presentation_material) section_data['text'] = text # Parse sectionproc_extension @@ -254,14 +254,14 @@ def _parse_question(self, item_el): # Parse hint hint_el = item_el.find('hint') if hint_el is not None: - question_data['hint'] = self._extract_text_from_hint(hint_el) + question_data['hint'] = self._extract_hint_text(hint_el) # Parse general feedback feedback_els = item_el.findall('itemfeedback') for feedback_el in feedback_els: # General feedback typically has ident matching the question label if feedback_el.get('ident') == question_data['label']: - question_data['feedback'] = self._extract_text_from_feedback(feedback_el) + question_data['feedback'] = self._extract_feedback_text(feedback_el) # Parse question-specific data based on type question_type = question_data['question_type'] @@ -290,7 +290,7 @@ def _parse_question(self, item_el): return question_data - def _extract_text_from_material(self, material_el): + def _extract_material_text(self, material_el): """ Extract text content from material element, handling CDATA. Automatically cleans CDATA whitespace and HTML tags. @@ -314,9 +314,9 @@ def _extract_text_from_material(self, material_el): # Decode HTML entities (handles &, <, >, 🤣, etc.) 
decoded_text = html.unescape(raw_text) # Clean CDATA whitespace while preserving HTML tags - cleaned_text = self._clean_cdata_text(decoded_text) + cleaned_text = self._clean_cdata(decoded_text) # Convert SCORM image file paths to base64 - cleaned_text = self._convert_scorm_images_to_base64(cleaned_text) + cleaned_text = self._inline_scorm_images(cleaned_text) text_parts.append(cleaned_text) return ''.join(text_parts) @@ -343,21 +343,21 @@ def _extract_question_text(self, presentation_el): # Decode HTML entities (handles &, <, >, 🤣, etc.) decoded_text = html.unescape(raw_text) # Clean CDATA whitespace while preserving HTML tags - cleaned_text = self._clean_cdata_text(decoded_text) + cleaned_text = self._clean_cdata(decoded_text) # Convert SCORM image file paths to base64 - cleaned_text = self._convert_scorm_images_to_base64(cleaned_text) + cleaned_text = self._inline_scorm_images(cleaned_text) text_parts.append(cleaned_text) return ''.join(text_parts) - def _extract_text_from_hint(self, hint_el): + def _extract_hint_text(self, hint_el): """Extract text from hint element.""" hintmaterial = hint_el.find('hintmaterial') if hintmaterial is not None: - return self._extract_text_from_material(hintmaterial) + return self._extract_material_text(hintmaterial) return None - def _extract_text_from_feedback(self, feedback_el): + def _extract_feedback_text(self, feedback_el): """ Extract text from feedback element. Automatically cleans CDATA whitespace while preserving HTML tags. @@ -372,12 +372,12 @@ def _extract_text_from_feedback(self, feedback_el): # Decode HTML entities (handles &, <, >, 🤣, etc.) 
decoded_text = html.unescape(raw_text) # Clean CDATA whitespace while preserving HTML tags - cleaned_text = self._clean_cdata_text(decoded_text) + cleaned_text = self._clean_cdata(decoded_text) # Convert SCORM image file paths to base64 - return self._convert_scorm_images_to_base64(cleaned_text) + return self._inline_scorm_images(cleaned_text) return None - def _clean_cdata_text(self, text): + def _clean_cdata(self, text): """ Clean text extracted from CDATA sections in SCORM XML. @@ -413,7 +413,7 @@ def _clean_cdata_text(self, text): cleaned = re.sub(r'\s+', ' ', text).strip() return cleaned - def _convert_scorm_images_to_base64(self, html_text): + def _inline_scorm_images(self, html_text): """ Convert SCORM image file paths to base64 data URIs in HTML text. @@ -510,7 +510,7 @@ def replace_image(match): result = re.sub(img_pattern, replace_image, html_text) return result - def _convert_html_with_base64_images_to_markdown(self, html_text): + def _html_to_markdown(self, html_text): """ Convert HTML text with base64 images to markdown format. @@ -670,7 +670,7 @@ def _parse_multiple_choice(self, item_el, question_ident): # Decode HTML entities (handles &, <, >, 🤣, etc.) 
decoded_text = html.unescape(raw_text) # Clean CDATA whitespace while preserving HTML tags - answer_text = self._clean_cdata_text(decoded_text) + answer_text = self._clean_cdata(decoded_text) # Find weight from resprocessing weight = 0.0 @@ -696,7 +696,7 @@ def _parse_multiple_choice(self, item_el, question_ident): feedback_ident = displayfeedback.get('linkrefid', '') feedback_el = item_el.find(f".//itemfeedback[@ident='{feedback_ident}']") if feedback_el is not None: - answer_feedback = self._extract_text_from_feedback(feedback_el) + answer_feedback = self._extract_feedback_text(feedback_el) mc_data['answers'].append({ 'answer': answer_text, @@ -781,7 +781,7 @@ def _parse_true_false(self, item_el, question_ident): feedback_ident = displayfeedback.get('linkrefid', '') feedback_el = item_el.find(f".//itemfeedback[@ident='{feedback_ident}']") if feedback_el is not None: - tf_data['true_feedback'] = self._extract_text_from_feedback(feedback_el) + tf_data['true_feedback'] = self._extract_feedback_text(feedback_el) elif false_ident and answer_ident == false_ident: setvar = respcondition.find('setvar') @@ -797,7 +797,7 @@ def _parse_true_false(self, item_el, question_ident): feedback_ident = displayfeedback.get('linkrefid', '') feedback_el = item_el.find(f".//itemfeedback[@ident='{feedback_ident}']") if feedback_el is not None: - tf_data['false_feedback'] = self._extract_text_from_feedback(feedback_el) + tf_data['false_feedback'] = self._extract_feedback_text(feedback_el) return tf_data @@ -927,7 +927,7 @@ def _parse_multi_select(self, item_el, question_ident): # Decode HTML entities (handles &, <, >, 🤣, etc.) 
decoded_text = html.unescape(raw_text) # Clean CDATA whitespace while preserving HTML tags - answer_text = self._clean_cdata_text(decoded_text) + answer_text = self._clean_cdata(decoded_text) # Determine if correct from resprocessing is_correct = False @@ -952,7 +952,7 @@ def _parse_multi_select(self, item_el, question_ident): feedback_ident = displayfeedback.get('linkrefid', '') feedback_el = item_el.find(f".//itemfeedback[@ident='{feedback_ident}']") if feedback_el is not None: - answer_feedback = self._extract_text_from_feedback(feedback_el) + answer_feedback = self._extract_feedback_text(feedback_el) ms_data['answers'].append({ 'answer': answer_text, @@ -1009,7 +1009,7 @@ def _parse_matching(self, item_el, question_ident): if mattext is not None: raw_text = mattext.text if mattext.text else '' # Clean CDATA whitespace while preserving HTML tags - answer_text = self._clean_cdata_text(raw_text) + answer_text = self._clean_cdata(raw_text) if answer_text and answer_ident not in matching_answers: matching_answers[answer_ident] = answer_text @@ -1027,7 +1027,7 @@ def _parse_matching(self, item_el, question_ident): # Decode HTML entities (handles &, <, >, 🤣, etc.) decoded_text = html.unescape(raw_text) # Clean CDATA whitespace while preserving HTML tags - choice_text = self._clean_cdata_text(decoded_text) + choice_text = self._clean_cdata(decoded_text) # Find correct answer from resprocessing correct_answer_ident = None @@ -1097,7 +1097,7 @@ def _parse_ordering(self, item_el, question_ident): # Decode HTML entities (handles &, <, >, 🤣, etc.) 
decoded_text = html.unescape(raw_text) # Clean CDATA whitespace while preserving HTML tags - text = self._clean_cdata_text(decoded_text) + text = self._clean_cdata(decoded_text) # Find feedback ord_feedback = None @@ -1105,7 +1105,7 @@ def _parse_ordering(self, item_el, question_ident): feedback_ident = question_ident_feedback + str(order_index) feedback_el = item_el.find(f".//itemfeedback[@ident='{feedback_ident}']") if feedback_el is not None: - ord_feedback = self._extract_text_from_feedback(feedback_el) + ord_feedback = self._extract_feedback_text(feedback_el) ord_data['items'].append({ 'text': text, @@ -1149,7 +1149,7 @@ def _parse_written_response(self, item_el, question_ident): if mattext is not None: raw_text = mattext.text if mattext.text else '' # Clean CDATA whitespace while preserving HTML tags - wr_data['answer_key'] = self._clean_cdata_text(raw_text) + wr_data['answer_key'] = self._clean_cdata(raw_text) # Parse initial_text (if present) initial_text_el = item_el.find('initial_text') @@ -1162,7 +1162,7 @@ def _parse_written_response(self, item_el, question_ident): # Decode HTML entities (handles &, <, >, 🤣, etc.) 
decoded_text = html.unescape(raw_text) # Clean CDATA whitespace while preserving HTML tags - cleaned_text = self._clean_cdata_text(decoded_text) + cleaned_text = self._clean_cdata(decoded_text) wr_data['initial_text'] = cleaned_text if cleaned_text else None return wr_data @@ -1429,7 +1429,9 @@ def format_to_markdown(self, question_library): # Add root-level text (main_text) if present if getattr(question_library, "main_text", None): - lines.append(question_library.main_text) + # Convert HTML (including embedded images) to markdown for SCORM output + main_text = self._html_to_markdown(question_library.main_text) + lines.append(main_text) lines.append("") # Process sections @@ -1463,7 +1465,7 @@ def format_to_markdown(self, question_library): if should_display_text: # Convert HTML with base64 images to markdown - section_text = self._convert_html_with_base64_images_to_markdown(section.text) + section_text = self._html_to_markdown(section.text) lines.append(section_text) # Process questions in this section @@ -1529,7 +1531,7 @@ def _format_question_to_markdown(self, question): # Note: For FIB questions, skip displaying question.text here since FIB formatting includes all text parts if question.text and question.questiontype != 'FIB': # Convert HTML with base64 images to markdown - question_text = self._convert_html_with_base64_images_to_markdown(question.text) + question_text = self._html_to_markdown(question.text) # Extract plain text for numbering (remove markdown formatting) plain_text = re.sub(r'!\[.*?\]\([^)]+\)', '', question_text) # Remove image markdown @@ -1591,12 +1593,12 @@ def _format_question_to_markdown(self, question): # Add hint if present (format: @Hint: or @HINT:) if question.hint: - hint_text = self._convert_html_with_base64_images_to_markdown(question.hint) + hint_text = self._html_to_markdown(question.hint) lines.append(f"@Hint: {hint_text}") # Add feedback if present (format: @Feedback: or @FEEDBACK:) if question.feedback: - feedback_text = 
self._convert_html_with_base64_images_to_markdown(question.feedback) + feedback_text = self._html_to_markdown(question.feedback) lines.append(f"@Feedback: {feedback_text}") # Use double newlines so each logical line becomes a paragraph (hard breaks, not soft) @@ -1616,11 +1618,11 @@ def _format_multiple_choice_markdown(self, question): # Correct answer has * before the letter (weight > 0) marker = "*" if answer.weight and answer.weight > 0 else "" # Convert HTML with base64 images to markdown - answer_text = self._convert_html_with_base64_images_to_markdown(answer.answer) + answer_text = self._html_to_markdown(answer.answer) # Indent as level 2 list (4 spaces for markdown level 2) lines.append(f" {letter}. {marker}{answer_text}") if answer.answer_feedback: - feedback_text = self._convert_html_with_base64_images_to_markdown(answer.answer_feedback) + feedback_text = self._html_to_markdown(answer.answer_feedback) lines.append(f" @Feedback: {feedback_text}") return "\n".join(lines) @@ -1637,11 +1639,11 @@ def _format_true_false_markdown(self, question): # Indent as level 2 list (4 spaces for markdown level 2) lines.append(f" a. {true_marker}True") if tf.true_feedback: - feedback_text = self._convert_html_with_base64_images_to_markdown(tf.true_feedback) + feedback_text = self._html_to_markdown(tf.true_feedback) lines.append(f" @Feedback: {feedback_text}") lines.append(f" b. 
{false_marker}False") if tf.false_feedback: - feedback_text = self._convert_html_with_base64_images_to_markdown(tf.false_feedback) + feedback_text = self._html_to_markdown(tf.false_feedback) lines.append(f" @Feedback: {feedback_text}") return "\n".join(lines) @@ -1659,7 +1661,7 @@ def _format_fib_markdown(self, question): if fib.type == 'fibquestion': if fib.text: # Convert HTML with base64 images to markdown, preserving spacing - cleaned_text = self._convert_html_with_base64_images_to_markdown(fib.text) + cleaned_text = self._html_to_markdown(fib.text) current_text += cleaned_text elif fib.type == 'fibanswer': # Insert answer in brackets [answer] where the blank should be @@ -1684,11 +1686,11 @@ def _format_multi_select_markdown(self, question): letter = chr(96 + idx) # a, b, c, etc. marker = "*" if answer.is_correct else "" # Convert HTML with base64 images to markdown - answer_text = self._convert_html_with_base64_images_to_markdown(answer.answer) + answer_text = self._html_to_markdown(answer.answer) # Indent as level 2 list (4 spaces for markdown level 2) lines.append(f" {letter}. {marker}{answer_text}") if answer.answer_feedback: - feedback_text = self._convert_html_with_base64_images_to_markdown(answer.answer_feedback) + feedback_text = self._html_to_markdown(answer.answer_feedback) lines.append(f" @Feedback: {feedback_text}") return "\n".join(lines) @@ -1706,14 +1708,14 @@ def _format_matching_markdown(self, question): letter = chr(96 + idx) # a, b, c, etc. 
# Convert HTML with base64 images to markdown (preserves inline styling and images) - choice_text = self._convert_html_with_base64_images_to_markdown(choice.choice_text) + choice_text = self._html_to_markdown(choice.choice_text) # Use the related manager matching_answers (from ForeignKey in MatchingAnswer) answers = choice.matching_answers.all() if answers: # Get the first matching answer (typically there's one per choice) answer = answers[0] - answer_text = self._convert_html_with_base64_images_to_markdown(answer.answer_text) + answer_text = self._html_to_markdown(answer.answer_text) # Indent as level 2 list (4 spaces for markdown level 2) lines.append(f" {letter}. {choice_text} = {answer_text}") else: @@ -1721,7 +1723,7 @@ def _format_matching_markdown(self, question): lines.append(f" {letter}. {choice_text} =") return "\n".join(lines) - def _remove_block_tags_preserve_inline(self, html_text): + def _strip_block_tags(self, html_text): """ Remove block-level HTML tags (p, div, etc.) but preserve inline styling tags (strong, em, b, i, etc.). This allows formatting like bold/italic to be preserved while removing tags that cause line breaks. @@ -1765,11 +1767,11 @@ def _format_ordering_markdown(self, question): for idx, ordering in enumerate(orderings, start=1): letter = chr(96 + idx) # a, b, c, etc. # Convert HTML with base64 images to markdown - ordering_text = self._convert_html_with_base64_images_to_markdown(ordering.text) + ordering_text = self._html_to_markdown(ordering.text) # Indent as level 2 list (4 spaces for markdown level 2) lines.append(f" {letter}. 
{ordering_text}") if ordering.ord_feedback: - feedback_text = self._convert_html_with_base64_images_to_markdown(ordering.ord_feedback) + feedback_text = self._html_to_markdown(ordering.ord_feedback) lines.append(f" @Feedback: {feedback_text}") return "\n".join(lines) @@ -1785,7 +1787,7 @@ def _format_written_response_markdown(self, question): # Add blank line first (double newline for hard paragraph break) lines.append("") # Convert HTML with base64 images to markdown - answer_text = self._convert_html_with_base64_images_to_markdown(wr.answer_key) + answer_text = self._html_to_markdown(wr.answer_key) # Indent with regular spaces (3 for label, 7 for answer) to mimic margin # Avoid 4+ leading spaces to prevent markdown list or code block detection lines.append(f"Correct Answer:") diff --git a/api/scorm/XmlWriter.py b/api/scorm/XmlWriter.py index 338b674..5f7626e 100644 --- a/api/scorm/XmlWriter.py +++ b/api/scorm/XmlWriter.py @@ -35,7 +35,7 @@ def __init__(self, question_library): if question_library.shuffle is True: self.create_section_shuffle(base_section_el) - self.create_presentation_material(base_section_el, "") # we currently not catching any base section text and it's ignored in ANTLR + self.create_presentation_material(base_section_el, question_library.main_text) # include root-level text when present sec_proc = ET.SubElement(base_section_el, "sectionproc_extension") sec_proc_dis_name = ET.SubElement(sec_proc, "d2l_2p0:display_section_name") diff --git a/api/serializers.py b/api/serializers.py index 5e11dcd..7cf4271 100644 --- a/api/serializers.py +++ b/api/serializers.py @@ -5,6 +5,7 @@ from rest_framework import serializers from .models import Matching, MatchingAnswer, MatchingChoice, Ordering, QuestionLibrary, Section, Question, MultipleChoice, MultipleChoiceAnswer, TrueFalse, Fib, MultipleSelect, MultipleSelectAnswer, WrittenResponse from django.conf import settings +from .process.process_helper import trim_md_to_html def validate_docx_file(value): @@ 
-279,11 +280,17 @@ class Meta: class JsonResponseSerializer(serializers.ModelSerializer): # sections = SectionSerializer(many=True, read_only=True) sections = serializers.SerializerMethodField() + main_text = serializers.SerializerMethodField() def get_sections(self, questionlibrary): section_queryset = questionlibrary.get_sections() serializer = SectionSerializer(instance=section_queryset, many=True) return serializer.data + + def get_main_text(self, questionlibrary): + if not questionlibrary.main_text: + return questionlibrary.main_text + return trim_md_to_html(questionlibrary.main_text) class Meta: model = QuestionLibrary fields = ['main_title', 'main_text', 'randomize_answer', 'enumeration', 'media_folder', 'sections'] @@ -340,11 +347,18 @@ class Meta: class QuestionLibraryPackageSerializer(serializers.ModelSerializer): sections = SectionPackageSerializer(many=True, allow_null=True) + main_text = serializers.CharField(required=False, allow_null=True) class Meta: model = QuestionLibrary fields = ['main_title', 'main_text', 'randomize_answer', 'enumeration', 'media_folder', 'formatter_output', 'sectioner_output', 'sections'] + def to_representation(self, instance): + data = super().to_representation(instance) + if data.get('main_text'): + data['main_text'] = trim_md_to_html(data['main_text']) + return data + def create(self, validated_data): sections_data = validated_data.pop('sections') question_library_instance = QuestionLibrary.objects.create(**validated_data) From 78642b21495ebc0dd99d2e8e3ed1651b82c43f06 Mon Sep 17 00:00:00 2001 From: Arvin Rolos Date: Mon, 26 Jan 2026 12:06:01 -0800 Subject: [PATCH 8/8] api folder restructuring --- Dockerfile | 1 - api/consumers.py | 332 +-- api/formats/__init__.py | 1 + api/formats/docx/__init__.py | 1 + api/{process => formats/docx}/convert_txt.py | 0 api/{process => formats/docx}/endanswers.py | 2 +- .../docx}/extract_images.py | 2 +- .../docx}/fix_numbering.py | 0 api/{process => formats/docx}/formatter.py | 0 
api/{process => formats/docx}/parser.py | 2 +- .../docx}/process_helper.py | 0 api/{process => formats/docx}/sectioner.py | 2 +- api/{process => formats/docx}/splitter.py | 4 +- api/formats/scorm/__init__.py | 1 + api/formats/scorm/manifest.py | 79 + api/formats/scorm/manifest_builder.py | 29 + api/formats/scorm/scorm_extractor.py | 255 +++ api/formats/scorm/scorm_formatter.py | 415 ++++ api/formats/scorm/scorm_parser.py | 845 ++++++++ api/formats/scorm/scorm_question_builder.py | 23 + api/formats/scorm/scorm_unzipper.py | 30 + api/formats/scorm/scorm_writer.py | 141 ++ api/formats/scorm/xml_builders/__init__.py | 19 + api/formats/scorm/xml_builders/base.py | 57 + api/formats/scorm/xml_builders/fib.py | 71 + api/formats/scorm/xml_builders/matching.py | 122 ++ .../scorm/xml_builders/multi_select.py | 77 + .../scorm/xml_builders/multiple_choice.py | 66 + api/formats/scorm/xml_builders/ordering.py | 86 + api/formats/scorm/xml_builders/true_false.py | 72 + .../scorm/xml_builders/written_response.py | 56 + api/{ => formats}/scorm/xmlcdata.py | 9 +- api/models.py | 50 +- api/pipelines/__init__.py | 1 + api/pipelines/docx_to_json.py | 36 + api/pipelines/json_to_docx.py | 239 +++ api/pipelines/json_to_scorm.py | 66 + api/pipelines/response_payload.py | 141 ++ api/pipelines/scorm_to_json.py | 50 + api/pipelines/ws_pipeline.py | 142 ++ api/process/process.py | 127 -- api/questions/__init__.py | 1 + api/questions/model_builders/__init__.py | 1 + .../model_builders}/fib.py | 2 +- .../model_builders}/matching.py | 2 +- .../model_builders}/multiplechoice.py | 2 +- .../model_builders}/multipleselect.py | 6 +- .../model_builders}/ordering.py | 2 +- .../model_builders}/truefalse.py | 2 +- .../model_builders}/writtenresponse.py | 2 +- api/scorm/XmlReader.py | 1841 ----------------- api/scorm/XmlWriter.py | 755 ------- api/scorm/manifest.py | 22 - api/serializers.py | 10 +- api/tasks.py | 16 +- api/urls.py | 4 +- api/views.py | 577 +----- qcon/settings.py | 1 - qcon/urls.py | 2 - 
restapi/__init__.py | 0 restapi/admin.py | 3 - restapi/apps.py | 19 - restapi/logging/ErrorTypes.py | 176 -- restapi/logging/WarningTypes.py | 61 - restapi/logging/contextfilter.py | 25 - restapi/logging/logging_adapter.py | 22 - restapi/models.py | 818 -------- restapi/process/common/extract_images.py | 29 - restapi/process/common/process_helper.py | 66 - restapi/process/common/restore_images.py | 26 - restapi/process/endanswers.py | 45 - restapi/process/formatter/convert_txt.py | 42 - restapi/process/formatter/fix_numbering.py | 92 - restapi/process/formatter/formatter.py | 126 -- .../process/questionparser/questionparser.py | 141 -- restapi/process/sectioner/sectioner.py | 82 - restapi/process/splitter/splitter.py | 162 -- restapi/serializers.py | 58 - restapi/tasks.py | 64 - restapi/tests.py | 3 - restapi/urls.py | 15 - restapi/views.py | 131 -- 82 files changed, 3400 insertions(+), 5706 deletions(-) create mode 100644 api/formats/__init__.py create mode 100644 api/formats/docx/__init__.py rename api/{process => formats/docx}/convert_txt.py (100%) rename api/{process => formats/docx}/endanswers.py (97%) rename api/{process => formats/docx}/extract_images.py (97%) rename api/{process => formats/docx}/fix_numbering.py (100%) rename api/{process => formats/docx}/formatter.py (100%) rename api/{process => formats/docx}/parser.py (98%) rename api/{process => formats/docx}/process_helper.py (100%) rename api/{process => formats/docx}/sectioner.py (99%) rename api/{process => formats/docx}/splitter.py (99%) create mode 100644 api/formats/scorm/__init__.py create mode 100644 api/formats/scorm/manifest.py create mode 100644 api/formats/scorm/manifest_builder.py create mode 100644 api/formats/scorm/scorm_extractor.py create mode 100644 api/formats/scorm/scorm_formatter.py create mode 100644 api/formats/scorm/scorm_parser.py create mode 100644 api/formats/scorm/scorm_question_builder.py create mode 100644 api/formats/scorm/scorm_unzipper.py create mode 100644 
api/formats/scorm/scorm_writer.py create mode 100644 api/formats/scorm/xml_builders/__init__.py create mode 100644 api/formats/scorm/xml_builders/base.py create mode 100644 api/formats/scorm/xml_builders/fib.py create mode 100644 api/formats/scorm/xml_builders/matching.py create mode 100644 api/formats/scorm/xml_builders/multi_select.py create mode 100644 api/formats/scorm/xml_builders/multiple_choice.py create mode 100644 api/formats/scorm/xml_builders/ordering.py create mode 100644 api/formats/scorm/xml_builders/true_false.py create mode 100644 api/formats/scorm/xml_builders/written_response.py rename api/{ => formats}/scorm/xmlcdata.py (81%) create mode 100644 api/pipelines/__init__.py create mode 100644 api/pipelines/docx_to_json.py create mode 100644 api/pipelines/json_to_docx.py create mode 100644 api/pipelines/json_to_scorm.py create mode 100644 api/pipelines/response_payload.py create mode 100644 api/pipelines/scorm_to_json.py create mode 100644 api/pipelines/ws_pipeline.py delete mode 100644 api/process/process.py create mode 100644 api/questions/__init__.py create mode 100644 api/questions/model_builders/__init__.py rename api/{process/questionbuilder => questions/model_builders}/fib.py (98%) rename api/{process/questionbuilder => questions/model_builders}/matching.py (98%) rename api/{process/questionbuilder => questions/model_builders}/multiplechoice.py (96%) rename api/{process/questionbuilder => questions/model_builders}/multipleselect.py (92%) rename api/{process/questionbuilder => questions/model_builders}/ordering.py (94%) rename api/{process/questionbuilder => questions/model_builders}/truefalse.py (97%) rename api/{process/questionbuilder => questions/model_builders}/writtenresponse.py (94%) delete mode 100644 api/scorm/XmlReader.py delete mode 100644 api/scorm/XmlWriter.py delete mode 100644 api/scorm/manifest.py delete mode 100644 restapi/__init__.py delete mode 100644 restapi/admin.py delete mode 100644 restapi/apps.py delete mode 100644 
restapi/logging/ErrorTypes.py delete mode 100644 restapi/logging/WarningTypes.py delete mode 100644 restapi/logging/contextfilter.py delete mode 100644 restapi/logging/logging_adapter.py delete mode 100644 restapi/models.py delete mode 100644 restapi/process/common/extract_images.py delete mode 100644 restapi/process/common/process_helper.py delete mode 100644 restapi/process/common/restore_images.py delete mode 100644 restapi/process/endanswers.py delete mode 100644 restapi/process/formatter/convert_txt.py delete mode 100644 restapi/process/formatter/fix_numbering.py delete mode 100644 restapi/process/formatter/formatter.py delete mode 100644 restapi/process/questionparser/questionparser.py delete mode 100644 restapi/process/sectioner/sectioner.py delete mode 100644 restapi/process/splitter/splitter.py delete mode 100644 restapi/serializers.py delete mode 100644 restapi/tasks.py delete mode 100644 restapi/tests.py delete mode 100644 restapi/urls.py delete mode 100644 restapi/views.py diff --git a/Dockerfile b/Dockerfile index e3e66ed..ed40aee 100644 --- a/Dockerfile +++ b/Dockerfile @@ -132,7 +132,6 @@ COPY --from=builder /usr/src /antlr_build/ COPY qcon qcon/ COPY api api/ COPY pandoc pandoc/ -COPY restapi restapi/ ENTRYPOINT ["docker-entrypoint.sh"] diff --git a/api/consumers.py b/api/consumers.py index 5b3e9dc..3dae884 100644 --- a/api/consumers.py +++ b/api/consumers.py @@ -3,27 +3,21 @@ from django.core.files.base import ContentFile import base64 from os.path import normpath -from .models import Question, Section, QuestionLibrary, \ - Image, MultipleChoice, MultipleChoiceAnswer, TrueFalse, Fib, MultipleSelect, MultipleSelectAnswer, \ - Matching, MatchingAnswer, MatchingChoice, Ordering, WrittenResponse -import re +from .models import QuestionLibrary import logging newlogger = logging.getLogger(__name__) from .logging.logging_adapter import FilenameLoggingAdapter # from .logging.contextfilter import QuestionlibraryFilenameFilter # 
logger.addFilter(QuestionlibraryFilenameFilter()) -from .logging.ErrorTypes import EMFImageError -from .process.process_helper import add_error_message, html_to_plain, trim_text -from .serializers import JsonResponseSerializer -from .process.process import Process +from .pipelines.response_payload import build_response_payload, build_status_payload +from .pipelines.ws_pipeline import Process -from .process.extract_images import ImageExtractError -from .process.formatter import FormatterError -from .process.sectioner import SectionerError -from .process.splitter import SplitterError -from .process.endanswers import EndAnswerError -from .process.parser import ParserError -from .tasks import MarkDownConversionError +from .formats.docx.extract_images import ImageExtractError +from .formats.docx.formatter import FormatterError +from .formats.docx.sectioner import SectionerError +from .formats.docx.splitter import SplitterError +from .formats.docx.endanswers import EndAnswerError +from .formats.docx.parser import ParserError # class FilenameLoggingAdapter(logging.LoggerAdapter): @@ -49,47 +43,6 @@ def disconnect(self, close_code): newlogger.info("Closing Connection") # self.channel_layer.group_discard(self.sessionid, self.channel_name) - # Replace image marker with actual img element and return a boolean - def replace_image(self, obj, key, process, logger): - regex = r"(?<=<<<<)\d+(?=>>>>)" - obj_text = getattr(obj, key) - is_image = None - if obj_text: - is_image = re.search(regex, obj_text) - - if is_image != None: - obj_name = obj._meta.model.__name__ - if obj_name == "Question": - logger.debug(f'Adding Image(s) to Question #{obj.number_provided}') - elif obj_name == "Section": - logger.debug(f'Adding Image(s) to Section "{obj.title}"') - else: - logger.debug(f'Adding Image(s) to a {obj_name}') - - image_ids = list(set(re.findall(regex, obj_text))) - for image_id in image_ids: - image = process.questionlibrary.get_image(int(image_id)) - img_src = image.image - 
placeholder = "<<<<" + image_id + ">>>>" - - if re.match(r"\' - add_error_message(obj, error_message) - raise EMFImageError(obj.error) - except Exception as e: - logger.error(e) - - obj_text = re.sub(placeholder, lambda x: image.image, obj_text) - - setattr(obj, key, obj_text) - obj.save() - return True - return False - - - def receive_json(self, content, **kwargs): ########################################### @@ -125,9 +78,17 @@ def receive_json(self, content, **kwargs): logger.info("File Saved") except Exception as e: logger.error("Not a valid .docx File: {e}") - self.send(text_data=json.dumps(process.sendformat("Error", "Not a valid .docx File", ""))) + error_payload = build_status_payload( + "Error", + "Not a valid .docx File", + "", + process=None, + questionlibrary=None, + ) + self.send(text_data=json.dumps(error_payload)) # close connection - self.send(text_data=json.dumps(process.sendformat("Close", "", ""))) + close_payload = build_status_payload("Close", "", "", process=None, questionlibrary=None) + self.send(text_data=json.dumps(close_payload)) return ########################################### @@ -140,10 +101,23 @@ def receive_json(self, content, **kwargs): logger.info("Pandoc DONE") except Exception as e: logger.error(str(e)) - self.send( - text_data=json.dumps(process.sendformat("Error", "File unreadable", ""))) + error_payload = build_status_payload( + "Error", + "File unreadable", + "", + process=process, + questionlibrary=process.questionlibrary, + ) + self.send(text_data=json.dumps(error_payload)) # close connection - self.send(text_data=json.dumps(process.sendformat("Close", "", ""))) + close_payload = build_status_payload( + "Close", + "", + "", + process=process, + questionlibrary=process.questionlibrary, + ) + self.send(text_data=json.dumps(close_payload)) # return # except Exception as e: # self.send(text_data=json.dumps(process.sendformat("Close", "", ""))) @@ -199,9 +173,23 @@ def receive_json(self, content, **kwargs): logger.info("Formatter 
DONE") except FormatterError as e: logger.error("FormatterError: " + str(e)) - self.send(text_data=json.dumps(process.sendformat("Error", "No contents found in the body of the file", ""))) + error_payload = build_status_payload( + "Error", + "No contents found in the body of the file", + "", + process=process, + questionlibrary=process.questionlibrary, + ) + self.send(text_data=json.dumps(error_payload)) # close connection - self.send(text_data=json.dumps(process.sendformat("Close", "", ""))) + close_payload = build_status_payload( + "Close", + "", + "", + process=process, + questionlibrary=process.questionlibrary, + ) + self.send(text_data=json.dumps(close_payload)) return else: self.send(text_data=json.dumps(process.sendformat("Busy", "Content Body detected", ""))) @@ -215,9 +203,23 @@ def receive_json(self, content, **kwargs): logger.info("Sectioner DONE") except SectionerError as e: logger.error("SectionerError: " + str(e)) - self.send(text_data=json.dumps(process.sendformat("Error", "Sections can not be identified", ""))) + error_payload = build_status_payload( + "Error", + "Sections can not be identified", + "", + process=process, + questionlibrary=process.questionlibrary, + ) + self.send(text_data=json.dumps(error_payload)) # close connection - self.send(text_data=json.dumps(process.sendformat("Close", "", ""))) + close_payload = build_status_payload( + "Close", + "", + "", + process=process, + questionlibrary=process.questionlibrary, + ) + self.send(text_data=json.dumps(close_payload)) return else: self.send(text_data=json.dumps(process.sendformat("Busy", "Section found: " + str(process.subsection_count), ""))) @@ -231,9 +233,23 @@ def receive_json(self, content, **kwargs): logger.info("Splitter DONE") except Exception as e: logger.error("SplitterError: " + str(e)) - self.send(text_data=json.dumps(process.sendformat("Error", "Splitter failed", ""))) + error_payload = build_status_payload( + "Error", + "Splitter failed", + "", + process=process, + 
questionlibrary=process.questionlibrary, + ) + self.send(text_data=json.dumps(error_payload)) # close connection - self.send(text_data=json.dumps(process.sendformat("Close", "", ""))) + close_payload = build_status_payload( + "Close", + "", + "", + process=process, + questionlibrary=process.questionlibrary, + ) + self.send(text_data=json.dumps(close_payload)) return else: self.send(text_data=json.dumps(process.sendformat("Busy", "Question found: " + str(process.questions_expected), ""))) @@ -260,171 +276,39 @@ def receive_json(self, content, **kwargs): logger.info("Parser DONE") except Exception as e: logger.error("ParserError: " + str(e)) - self.send(text_data=json.dumps(process.sendformat("Error", "Parser failed", ""))) + error_payload = build_status_payload( + "Error", + "Parser failed", + "", + process=process, + questionlibrary=process.questionlibrary, + ) + self.send(text_data=json.dumps(error_payload)) # close connection - self.send(text_data=json.dumps(process.sendformat("Close", "", ""))) + close_payload = build_status_payload( + "Close", + "", + "", + process=process, + questionlibrary=process.questionlibrary, + ) + self.send(text_data=json.dumps(close_payload)) else: self.send(text_data=json.dumps(process.sendformat("Busy", "Parser complete", ""))) -########################################### - # Loop All Sections and Questions to count error, add/replace images, and add question.title -########################################### - logger.debug("Start Adding Images back ...") - try: - # select all sections for this QL - sections = process.questionlibrary.get_sections() - for section in sections: - - # DO NOT DELETE: replace images in section.text - section_replace_image = self.replace_image(section, "text", process, logger) - - # select all questions for this QL - questions = Question.objects.filter(section=section) - - for question in questions: - is_table = False - img_replaced = False - -########################################### - # count all 
question level errors -########################################### - # logger.debug("count all question level errors ...") - if question.info is not None: - process.question_info_count += 1 - - if question.warning is not None: - process.question_warning_count += 1 - - if question.error is not None: - process.question_error_count += 1 - - - is_table = re.search(r"", question.text) or is_table - -########################################### - # replace Image placeholder for questions -########################################### - - # replace image in question.text if exist - img_replaced = self.replace_image(question, 'text', process, logger) or img_replaced - - match(question.questiontype): - case 'MC': - #Check MC - MC_answer_objects = MultipleChoiceAnswer.objects.filter(multiple_choice__question=question) - for answer in MC_answer_objects: - img_replaced = self.replace_image(answer, 'answer', process, logger) or img_replaced - is_table = re.search(r"", answer.answer) or is_table - if answer.answer_feedback is not None: - img_replaced = self.replace_image(answer, 'answer_feedback', process, logger) or img_replaced - is_table = re.search(r"", answer.answer_feedback) or is_table - case 'TF': - #Check TF - TF_object = TrueFalse.objects.filter(question=question) - for tf in TF_object: - if tf.true_feedback is not None: - img_replaced = self.replace_image(tf, 'true_feedback', process, logger) or img_replaced - is_table = re.search(r"", tf.true_feedback) or is_table - if tf.false_feedback is not None: - img_replaced = self.replace_image(tf, 'false_feedback', process, logger) or img_replaced - is_table = re.search(r"", tf.false_feedback) or is_table - case 'FIB' | 'FMB': - #Check FIB - FIB_object = Fib.objects.filter(question=question) - for fib_question in FIB_object: - img_replaced = self.replace_image(fib_question, 'text', process, logger) or img_replaced - is_table = re.search(r"", fib_question.text) or is_table - case 'MS' | 'MR': - #Check MS - MS_answer_objects = 
MultipleSelectAnswer.objects.filter(multiple_select__question=question) - for answer in MS_answer_objects: - img_replaced = self.replace_image(answer, 'answer', process, logger) or img_replaced - is_table = re.search(r"", answer.answer) or is_table - if answer.answer_feedback is not None: - img_replaced = self.replace_image(answer, 'answer_feedback', process, logger) or img_replaced - is_table = re.search(r"", answer.answer_feedback) or is_table - case 'ORD': - #Check ORD - ORD_objects = Ordering.objects.filter(question=question) - for ordering in ORD_objects: - if ordering.text is not None: - img_replaced = self.replace_image(ordering, 'text', process, logger) or img_replaced - is_table = re.search(r"", ordering.text) or is_table - if ordering.ord_feedback is not None: - img_replaced = self.replace_image(ordering, 'ord_feedback', process, logger) or img_replaced - is_table = re.search(r"", ordering.ord_feedback) or is_table - case 'MAT' | 'MT': - #Check MAT answer - MAT_answer_objects = MatchingAnswer.objects.filter(matching_choice__matching__question=question) - for mat_answer in MAT_answer_objects: - if mat_answer.answer_text is not None: - img_replaced = self.replace_image(mat_answer, 'answer_text', process, logger) or img_replaced - is_table = re.search(r"", mat_answer.answer_text) or is_table - #Check MAT choice - MAT_choice_objects = MatchingChoice.objects.filter(matching__question=question) - for mat_choice in MAT_choice_objects: - if mat_choice.choice_text is not None: - img_replaced = self.replace_image(mat_choice, 'choice_text', process, logger) or img_replaced - is_table = re.search(r"", mat_choice.choice_text) or is_table - case 'WR' | 'E': - #Check WR - WR_objects = WrittenResponse.objects.filter(question=question) - for wr in WR_objects: - if wr.initial_text is not None: - img_replaced = self.replace_image(wr, 'initial_text', process, logger) or img_replaced - is_table = re.search(r"", wr.initial_text) or is_table - if wr.answer_key is not None: - 
img_replaced = self.replace_image(wr, 'answer_key', process, logger) or img_replaced - is_table = re.search(r"", wr.answer_key) or is_table - - -########################################### - # Add question.title -########################################### - prefix = '' - - if is_table: - prefix = '[TABLE]' + prefix - if img_replaced: - prefix = '[IMG]' + prefix - - # Save question.title - if question.title is None: - title_text = question.text - title_text = title_text.replace('\n', ' ') - title_text = re.sub(r"", "[IMG]", title_text) - title_text = re.sub(r"", "[TABLE]", title_text) - title_text = re.sub(r"<<<<\d+>>>>", "[IMG]", title_text) - - if question.questiontype == 'FIB' or question.questiontype == 'FMB': - title_text = re.sub(r"\[(.*?)\]", "_______", title_text) - - title_text = html_to_plain(title_text) - title_text = trim_text(title_text) - - if prefix != '': - prefix = prefix + ' ' - title_text = re.sub(r"\s*\[IMG\]", "", title_text).strip() - title_text = re.sub(r"\s*\[TABLE\]", "", title_text).strip() - - title_text = prefix + title_text - question.title = title_text[0:127] - question.save() - - except Exception as e: - logger.error(e) - - logger.debug("Adding Images back DONE") - - - -########################################### # serialize and send response ########################################### logger.info("Process End") - serialized_ql = JsonResponseSerializer(process.questionlibrary) - self.send(text_data=json.dumps(process.sendformat("Done", "", serialized_ql.data))) + json_data = build_response_payload(process.questionlibrary, preview=True) + done_payload = build_status_payload( + "Done", + "", + json_data, + process=process, + questionlibrary=process.questionlibrary, + ) + self.send(text_data=json.dumps(done_payload)) ######################### Close Connection self.send(text_data=json.dumps(process.sendformat("Close", "", ""))) diff --git a/api/formats/__init__.py b/api/formats/__init__.py new file mode 100644 index 0000000..dc75237 --- 
/dev/null +++ b/api/formats/__init__.py @@ -0,0 +1 @@ +# Package for supported content formats. diff --git a/api/formats/docx/__init__.py b/api/formats/docx/__init__.py new file mode 100644 index 0000000..cda4da3 --- /dev/null +++ b/api/formats/docx/__init__.py @@ -0,0 +1 @@ +# DOCX format handlers. diff --git a/api/process/convert_txt.py b/api/formats/docx/convert_txt.py similarity index 100% rename from api/process/convert_txt.py rename to api/formats/docx/convert_txt.py diff --git a/api/process/endanswers.py b/api/formats/docx/endanswers.py similarity index 97% rename from api/process/endanswers.py rename to api/formats/docx/endanswers.py index 7ceedd8..0082ea3 100644 --- a/api/process/endanswers.py +++ b/api/formats/docx/endanswers.py @@ -1,7 +1,7 @@ import os import subprocess import xml.etree.ElementTree as ET -from ..models import EndAnswer +from ...models import EndAnswer import re def get_endanswers(questionlibrary): diff --git a/api/process/extract_images.py b/api/formats/docx/extract_images.py similarity index 97% rename from api/process/extract_images.py rename to api/formats/docx/extract_images.py index d184946..e8a298e 100644 --- a/api/process/extract_images.py +++ b/api/formats/docx/extract_images.py @@ -1,5 +1,5 @@ import re -from ..models import Image +from ...models import Image def extract_images(questionlibrary): try: diff --git a/api/process/fix_numbering.py b/api/formats/docx/fix_numbering.py similarity index 100% rename from api/process/fix_numbering.py rename to api/formats/docx/fix_numbering.py diff --git a/api/process/formatter.py b/api/formats/docx/formatter.py similarity index 100% rename from api/process/formatter.py rename to api/formats/docx/formatter.py diff --git a/api/process/parser.py b/api/formats/docx/parser.py similarity index 98% rename from api/process/parser.py rename to api/formats/docx/parser.py index 4761d3b..659eece 100644 --- a/api/process/parser.py +++ b/api/formats/docx/parser.py @@ -1,6 +1,6 @@ import os import 
xml.etree.ElementTree as ET -from ..models import EndAnswer, Section, Question +from ...models import EndAnswer, Section, Question from django.conf import settings import logging diff --git a/api/process/process_helper.py b/api/formats/docx/process_helper.py similarity index 100% rename from api/process/process_helper.py rename to api/formats/docx/process_helper.py diff --git a/api/process/sectioner.py b/api/formats/docx/sectioner.py similarity index 99% rename from api/process/sectioner.py rename to api/formats/docx/sectioner.py index 0856d3b..7d3d06e 100644 --- a/api/process/sectioner.py +++ b/api/formats/docx/sectioner.py @@ -3,7 +3,7 @@ import xml.etree.ElementTree as ET # from .process_helper import markdown_to_plain, trim_text, markdown_to_html from api.tasks import markdown_to_plain, trim_text, markdown_to_html -from ..models import Section +from ...models import Section import logging newlogger = logging.getLogger(__name__) diff --git a/api/process/splitter.py b/api/formats/docx/splitter.py similarity index 99% rename from api/process/splitter.py rename to api/formats/docx/splitter.py index 391c371..caf227f 100644 --- a/api/process/splitter.py +++ b/api/formats/docx/splitter.py @@ -1,8 +1,8 @@ import os import subprocess import xml.etree.ElementTree as ET -from ..models import Section -from ..models import Question +from ...models import Section +from ...models import Question # from .process_helper import trim_text from api.tasks import trim_text import logging diff --git a/api/formats/scorm/__init__.py b/api/formats/scorm/__init__.py new file mode 100644 index 0000000..9d3416e --- /dev/null +++ b/api/formats/scorm/__init__.py @@ -0,0 +1 @@ +# SCORM format handlers. diff --git a/api/formats/scorm/manifest.py b/api/formats/scorm/manifest.py new file mode 100644 index 0000000..46f6374 --- /dev/null +++ b/api/formats/scorm/manifest.py @@ -0,0 +1,79 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. 
If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. + +import xml.etree.cElementTree as ET + +NS_D2L = "http://desire2learn.com/xsd/d2lcp_v2p0" +NS_IMS = "http://www.imsglobal.org/xsd/imscp_v1p1" + + +class ManifestEntity(object): + resources = [] + + def __init__(self): + del self.resources[:] + + def add_resource(self, manifest_resource_entity): + self.resources.append(manifest_resource_entity) + + +class ManifestResourceEntity(object): + def __init__(self, identifier, resource_type, material_type, href, title = '', link_target = ''): + self.identifier = identifier + self.resource_type = resource_type + self.material_type = material_type + self.href = href + self.title = title + self.link_target = link_target + + +def build_manifest_tree(manifest_entity: ManifestEntity, identifier: str = "MANIFEST_1") -> ET.ElementTree: + """ + Build an imsmanifest.xml tree using shared namespaces/constants. + """ + root = ET.Element( + "manifest", + {"xmlns:d2l_2p0": NS_D2L, "xmlns": NS_IMS, "identifier": identifier}, + ) + resources_el = ET.SubElement(root, "resources") + for resource in manifest_entity.resources: + ET.SubElement( + resources_el, + "resource", + { + "identifier": resource.identifier, + "type": resource.resource_type, + "d2l_2p0:material_type": resource.material_type, + "href": resource.href, + "d2l_2p0:link_target": resource.link_target, + "title": resource.title, + }, + ) + return ET.ElementTree(root) + + +def parse_manifest_tree(tree: ET.ElementTree) -> dict: + """ + Parse an imsmanifest.xml ElementTree into a simple dict structure + consistent with XmlReader.parse_manifest output. 
+ """ + root = tree.getroot() + manifest_data = { + "identifier": root.get("identifier", ""), + "resources": [], + } + resources_el = root.find("resources") + if resources_el is not None: + for resource_el in resources_el.findall("resource"): + manifest_data["resources"].append( + { + "identifier": resource_el.get("identifier", ""), + "type": resource_el.get("type", ""), + "material_type": resource_el.get(f"{{{NS_D2L}}}material_type", ""), + "href": resource_el.get("href", ""), + "link_target": resource_el.get(f"{{{NS_D2L}}}link_target", ""), + "title": resource_el.get("title", ""), + } + ) + return manifest_data \ No newline at end of file diff --git a/api/formats/scorm/manifest_builder.py b/api/formats/scorm/manifest_builder.py new file mode 100644 index 0000000..7db3106 --- /dev/null +++ b/api/formats/scorm/manifest_builder.py @@ -0,0 +1,29 @@ +import xml.etree.cElementTree as ET + + +def build_manifest(manifest_entity): + root = ET.Element( + "manifest", + { + "xmlns:d2l_2p0": "http://desire2learn.com/xsd/d2lcp_v2p0", + "xmlns": "http://www.imsglobal.org/xsd/imscp_v1p1", + "identifier": "MANIFEST_1", + }, + ) + doc = ET.SubElement(root, "resources") + + for resource in manifest_entity.resources: + ET.SubElement( + doc, + "resource", + { + "identifier": resource.identifier, + "type": resource.resource_type, + "d2l_2p0:material_type": resource.material_type, + "href": resource.href, + "d2l_2p0:link_target": resource.link_target, + "title": resource.title, + }, + ) + + return ET.ElementTree(root) diff --git a/api/formats/scorm/scorm_extractor.py b/api/formats/scorm/scorm_extractor.py new file mode 100644 index 0000000..fa7151d --- /dev/null +++ b/api/formats/scorm/scorm_extractor.py @@ -0,0 +1,255 @@ +from api.models import ( + QuestionLibrary, + Section, + Question, + MultipleChoice, + MultipleChoiceAnswer, + TrueFalse, + Fib, + MultipleSelect, + MultipleSelectAnswer, + Matching, + MatchingChoice, + MatchingAnswer, + Ordering, + WrittenResponse, +) + +from 
.scorm_unzipper import extract_scorm_zip +from .scorm_parser import ScormParser + + +class ScormExtractor: + """ + Import SCORM XML data into Django models. + """ + + def __init__(self, scorm_zip_path, extract_to_path=None): + self.scorm_zip_path = scorm_zip_path + self.extracted_path = extract_scorm_zip(scorm_zip_path, extract_to_path) + self.parser = ScormParser(self.extracted_path) + + def parse_manifest(self): + return self.parser.parse_manifest() + + def parse_questiondb(self): + return self.parser.parse_questiondb() + + def populate_django_models(self, question_library=None): + """ + Populate Django models from parsed SCORM XML data. + + Args: + question_library: Optional existing QuestionLibrary instance to use. + If None, a new one will be created. + + Returns: + QuestionLibrary: The QuestionLibrary instance with all sections and questions + """ + question_library_data = self.parse_questiondb() + + main_title = "" + if question_library_data["sections"]: + main_title = question_library_data["sections"][0].get("title", "") + + if question_library is None: + question_library = QuestionLibrary.objects.create( + main_title=main_title, + shuffle=False, + ) + else: + question_library.main_title = main_title + question_library.save() + + section_order = 1 + question_index = 1 + for section_data in question_library_data["sections"]: + has_nested_sections = len(section_data.get("sections", [])) > 0 + has_direct_questions = len(section_data.get("questions", [])) > 0 + has_text = section_data.get("text", "").strip() != "" + should_set_main_text = ( + has_text + # and section_data.get("is_text_displayed", False) + and not question_library.main_text + ) + if should_set_main_text: + question_library.main_text = section_data.get("text", "") + question_library.save(update_fields=["main_text"]) + + if has_direct_questions or has_text: + section = Section.objects.create( + question_library=question_library, + is_main_content=True, + order=section_order, + 
title=section_data.get("title", ""), + is_title_displayed=section_data.get("is_title_displayed", True), + text=section_data.get("text", ""), + is_text_displayed=section_data.get("is_text_displayed", False), + shuffle=section_data.get("shuffle", False), + ) + + for question_data in section_data.get("questions", []): + self._create_question_model(section, question_data, question_index) + question_index += 1 + + for nested_section_data in section_data.get("sections", []): + nested_section = Section.objects.create( + question_library=question_library, + is_main_content=False, + order=section_order + 1, + title=nested_section_data.get("title", ""), + is_title_displayed=nested_section_data.get("is_title_displayed", True), + text=nested_section_data.get("text", ""), + is_text_displayed=nested_section_data.get("is_text_displayed", False), + shuffle=nested_section_data.get("shuffle", False), + ) + + for question_data in nested_section_data.get("questions", []): + self._create_question_model(nested_section, question_data, question_index) + question_index += 1 + + section_order += 1 + + section_order += 1 + elif has_nested_sections: + for nested_section_data in section_data.get("sections", []): + nested_section = Section.objects.create( + question_library=question_library, + is_main_content=False, + order=section_order, + title=nested_section_data.get("title", ""), + is_title_displayed=nested_section_data.get("is_title_displayed", True), + text=nested_section_data.get("text", ""), + is_text_displayed=nested_section_data.get("is_text_displayed", False), + shuffle=section_data.get("shuffle", False), + ) + + for question_data in nested_section_data.get("questions", []): + self._create_question_model(nested_section, question_data, question_index) + question_index += 1 + + section_order += 1 + + return question_library + + def _create_question_model(self, section, question_data, index): + question = Question.objects.create( + section=section, + index=index, + 
title=question_data.get("title", ""), + questiontype=question_data.get("question_type_code", ""), + text=question_data.get("text", ""), + points=question_data.get("points", 1.0), + hint=question_data.get("hint"), + feedback=question_data.get("feedback"), + ) + + question_type_code = question_data.get("question_type_code", "") + specific_data = question_data.get("question_specific_data", {}) + + if question_type_code == "MC": + self._create_multiple_choice_model(question, specific_data) + elif question_type_code == "TF": + self._create_true_false_model(question, specific_data) + elif question_type_code == "FIB": + self._create_fib_model(question, specific_data) + elif question_type_code == "MS": + self._create_multiple_select_model(question, specific_data) + elif question_type_code == "MAT": + self._create_matching_model(question, specific_data) + elif question_type_code == "ORD": + self._create_ordering_model(question, specific_data) + elif question_type_code == "WR": + self._create_written_response_model(question, specific_data) + + return question + + def _create_multiple_choice_model(self, question, mc_data): + mc = MultipleChoice.objects.create( + question=question, + randomize=mc_data.get("randomize", False), + enumeration=mc_data.get("enumeration", 4), + ) + + for answer_data in mc_data.get("answers", []): + MultipleChoiceAnswer.objects.create( + multiple_choice=mc, + order=answer_data.get("order", 1), + answer=answer_data.get("answer", ""), + answer_feedback=answer_data.get("answer_feedback"), + weight=answer_data.get("weight", 0.0), + ) + + def _create_true_false_model(self, question, tf_data): + TrueFalse.objects.create( + question=question, + true_weight=tf_data.get("true_weight", 0.0), + true_feedback=tf_data.get("true_feedback"), + false_weight=tf_data.get("false_weight", 0.0), + false_feedback=tf_data.get("false_feedback"), + enumeration=tf_data.get("enumeration", 4), + ) + + def _create_fib_model(self, question, fib_data): + for fib_item in 
fib_data.get("fibs", []): + Fib.objects.create( + question=question, + type=fib_item.get("type", "fibquestion"), + text=fib_item.get("text", ""), + order=fib_item.get("order", 1), + size=fib_item.get("size"), + ) + + def _create_multiple_select_model(self, question, ms_data): + ms = MultipleSelect.objects.create( + question=question, + randomize=ms_data.get("randomize", False), + enumeration=ms_data.get("enumeration", 4), + style=ms_data.get("style", 2), + grading_type=ms_data.get("grading_type", 2), + ) + + for answer_data in ms_data.get("answers", []): + MultipleSelectAnswer.objects.create( + multiple_select=ms, + order=answer_data.get("order", 1), + answer=answer_data.get("answer", ""), + answer_feedback=answer_data.get("answer_feedback"), + is_correct=answer_data.get("is_correct", False), + ) + + def _create_matching_model(self, question, mat_data): + matching = Matching.objects.create( + question=question, + grading_type=mat_data.get("grading_type", 0), + ) + + for choice_data in mat_data.get("choices", []): + matching_choice = MatchingChoice.objects.create( + matching=matching, + choice_text=choice_data.get("choice_text", ""), + ) + + for answer_data in choice_data.get("matching_answers", []): + MatchingAnswer.objects.create( + matching_choice=matching_choice, + answer_text=answer_data.get("answer_text", ""), + ) + + def _create_ordering_model(self, question, ord_data): + for item_data in ord_data.get("items", []): + Ordering.objects.create( + question=question, + text=item_data.get("text", ""), + order=item_data.get("order", 1), + ord_feedback=item_data.get("ord_feedback"), + ) + + def _create_written_response_model(self, question, wr_data): + WrittenResponse.objects.create( + question=question, + enable_student_editor=wr_data.get("enable_student_editor", False), + initial_text=wr_data.get("initial_text"), + answer_key=wr_data.get("answer_key", ""), + enable_attachments=wr_data.get("enable_attachments", False), + ) diff --git 
a/api/formats/scorm/scorm_formatter.py b/api/formats/scorm/scorm_formatter.py new file mode 100644 index 0000000..0b06f69 --- /dev/null +++ b/api/formats/scorm/scorm_formatter.py @@ -0,0 +1,415 @@ +import re +from bs4 import BeautifulSoup + + +class ScormFormatter: + """ + Format question library models into markdown and DOCX. + """ + + def _html_to_markdown(self, html_text): + """ + Convert HTML text with base64 images to markdown format. + Preserves ALL tags as HTML. Converts MathML to TeX when possible. + """ + if not html_text: + return "" + + img_pattern = r"]*?>" + + html_images = {} + image_counter = 0 + + def preserve_img_tag(match): + nonlocal image_counter + full_img_tag = match.group(0) + placeholder = f"__HTML_IMAGE_{image_counter}__" + html_images[placeholder] = full_img_tag + image_counter += 1 + return placeholder + + math_blocks = {} + math_counter = 0 + math_pattern = r"" + + def preserve_math(match): + nonlocal math_counter + full_math = match.group(0) + placeholder = f"__MATH_BLOCK_{math_counter}__" + tex_match = re.search( + r']*encoding=["\']application/x-tex["\'][^>]*>(.*?)', + full_math, + flags=re.IGNORECASE | re.DOTALL, + ) + tex = tex_match.group(1) if tex_match else None + math_blocks[placeholder] = {"tex": tex, "raw": full_math} + math_counter += 1 + return placeholder + + result = re.sub(img_pattern, preserve_img_tag, html_text) + result = re.sub(math_pattern, preserve_math, result, flags=re.IGNORECASE) + + result = re.sub(r"

", "\n", result, flags=re.IGNORECASE) + result = re.sub(r"]*>", "\n", result, flags=re.IGNORECASE) + + try: + soup = BeautifulSoup(result, "html.parser") + for br in soup.find_all("br"): + br.replace_with("[[[BR]]]") + text = soup.get_text(separator=" ", strip=False) + text = text.replace("[[[BR]]]", "\n") + except Exception: + text = re.sub(r"<(?!/?__HTML_IMAGE_)[^>]+>", "", result) + + for placeholder, math_info in math_blocks.items(): + replacement = None + if math_info.get("tex"): + tex = math_info["tex"].strip() + replacement = f"$$ {tex} $$" + else: + replacement = math_info.get("raw", "") + text = text.replace(placeholder, replacement) + for placeholder, html_img in html_images.items(): + text = text.replace(placeholder, html_img) + + text = text.replace("\r", "") + text = re.sub(r"\n{3,}", "\n\n", text) + normalized_lines = [] + for line in text.split("\n"): + stripped = line.strip() + if stripped == "": + normalized_lines.append("") + continue + if ( + re.search(r"]*>", stripped, flags=re.IGNORECASE) + or re.search(r"\s+<", "><", result) + result = re.sub(r"\s+", " ", result) + result = result.strip() + return result + except Exception: + cleaned = re.sub(r">\s+<", "><", html_text) + cleaned = re.sub(r"\s+", " ", cleaned).strip() + return cleaned + + def format_to_markdown(self, question_library): + """ + Format parsed questions from Django models into markdown/text format. 
+ """ + lines = [] + + if question_library.main_title: + main_title = question_library.main_title + try: + soup = BeautifulSoup(main_title, "html.parser") + main_title = soup.get_text(separator=" ", strip=True) + except Exception: + main_title = re.sub(r"\s+", " ", main_title).strip() + lines.append(f"# {main_title}") + lines.append("") + + if getattr(question_library, "main_text", None): + main_text = self._html_to_markdown(question_library.main_text) + lines.append(main_text) + lines.append("") + + sections = question_library.get_sections() + for section in sections: + if not section.is_main_content: + if section.title and section.is_title_displayed: + section_title_display = section.title + try: + soup = BeautifulSoup(section_title_display, "html.parser") + section_title_display = soup.get_text(separator=" ", strip=True) + except Exception: + section_title_display = re.sub(r"\s+", " ", section_title_display).strip() + lines.append("") + lines.append("
") + lines.append("#section") + lines.append(f"## {section_title_display}") + + should_display_text = False + if section.is_main_content: + should_display_text = section.text and section.is_text_displayed + else: + should_display_text = bool(section.text) + + if should_display_text: + section_text = self._html_to_markdown(section.text) + lines.append(section_text) + + questions = section.get_questions() + for idx, question in enumerate(questions): + question_markdown = self._format_question_to_markdown(question) + lines.append(question_markdown) + + if not section.is_main_content and idx == len(questions) - 1: + lines.append("") + lines.append("
") + lines.append("/section") + + if not section.is_main_content and len(questions) == 0: + lines.append("") + lines.append("
") + lines.append("/section") + + result = "\n".join(lines) + if result and not result.endswith("\n"): + result += "\n" + return result + + def _format_question_to_markdown(self, question): + """ + Format a single question to markdown format matching raw_content format. + """ + lines = [] + + if question.questiontype: + lines.append("") + lines.append("
") + lines.append(f"Type: {question.questiontype}") + if question.title: + lines.append(f"Title: {question.title}") + if question.points: + normalized_points = str(float(question.points)).rstrip("0").rstrip(".") + lines.append(f"Points: {normalized_points}") + + randomize_value = None + if question.questiontype == "MC": + mc = question.get_multiple_choice() + if mc and mc.randomize is not None: + randomize_value = mc.randomize + elif question.questiontype == "MS": + ms = question.get_multiple_select() + if ms and ms.randomize is not None: + randomize_value = ms.randomize + if randomize_value is True: + lines.append("Randomize: yes") + + if question.text and question.questiontype != "FIB": + question_text = self._html_to_markdown(question.text) + plain_text = re.sub(r"!\[.*?\]\([^)]+\)", "", question_text) + plain_text = re.sub(r"<[^>]+>", "", plain_text) + plain_text = re.sub(r"\s+", " ", plain_text).strip() + + question_number = None + if question.index is not None: + question_number = question.index + elif question.number_provided is not None: + question_number = question.number_provided + + if question_number is not None: + lines.append(f"{question_number}. {question_text}") + else: + lines.append(question_text) + + question_type = question.questiontype + if question_type == "MC": + answer_text = self._format_multiple_choice_markdown(question) + if answer_text: + lines.append(answer_text) + elif question_type == "TF": + answer_text = self._format_true_false_markdown(question) + if answer_text: + lines.append(answer_text) + elif question_type == "FIB": + answer_text = self._format_fib_markdown(question) + if answer_text: + question_number = None + if question.index is not None: + question_number = question.index + elif question.number_provided is not None: + question_number = question.number_provided + + if question_number is not None: + lines.append(f"{question_number}. 
{answer_text}") + else: + lines.append(answer_text) + elif question_type == "MS": + answer_text = self._format_multi_select_markdown(question) + if answer_text: + lines.append(answer_text) + elif question_type == "MAT": + answer_text = self._format_matching_markdown(question) + if answer_text: + lines.append(answer_text) + elif question_type == "ORD": + answer_text = self._format_ordering_markdown(question) + if answer_text: + lines.append(answer_text) + elif question_type == "WR": + answer_text = self._format_written_response_markdown(question) + if answer_text: + lines.append(answer_text) + + if question.hint: + hint_text = self._html_to_markdown(question.hint) + lines.append(f"@Hint: {hint_text}") + + if question.feedback: + feedback_text = self._html_to_markdown(question.feedback) + lines.append(f"@Feedback: {feedback_text}") + + return "\n\n".join(lines) + + def _format_multiple_choice_markdown(self, question): + lines = [] + mc = question.get_multiple_choice() + if mc: + answers = mc.get_multiple_choice_answers() + for idx, answer in enumerate(answers, start=1): + letter = chr(96 + idx) + marker = "*" if answer.weight and answer.weight > 0 else "" + answer_text = self._html_to_markdown(answer.answer) + lines.append(f" {letter}. {marker}{answer_text}") + if answer.answer_feedback: + feedback_text = self._html_to_markdown(answer.answer_feedback) + lines.append(f" @Feedback: {feedback_text}") + return "\n".join(lines) + + def _format_true_false_markdown(self, question): + lines = [] + tf = question.get_true_false() + if tf: + true_marker = "*" if tf.true_weight and tf.true_weight > 0 else "" + false_marker = "*" if tf.false_weight and tf.false_weight > 0 else "" + lines.append(f" a. {true_marker}True") + if tf.true_feedback: + feedback_text = self._html_to_markdown(tf.true_feedback) + lines.append(f" @Feedback: {feedback_text}") + lines.append(f" b. 
{false_marker}False") + if tf.false_feedback: + feedback_text = self._html_to_markdown(tf.false_feedback) + lines.append(f" @Feedback: {feedback_text}") + return "\n".join(lines) + + def _format_fib_markdown(self, question): + lines = [] + fibs = question.get_fibs() + current_text = "" + for fib in fibs: + if fib.type == "fibquestion": + if fib.text: + cleaned_text = self._html_to_markdown(fib.text) + current_text += cleaned_text + elif fib.type == "fibanswer": + if fib.text: + current_text += f" [{fib.text}]" + else: + current_text += " [ ]" + if current_text: + lines.append(current_text) + return "\n".join(lines) + + def _format_multi_select_markdown(self, question): + lines = [] + ms = question.get_multiple_select() + if ms: + answers = ms.get_multiple_select_answers() + for idx, answer in enumerate(answers, start=1): + letter = chr(96 + idx) + marker = "*" if answer.is_correct else "" + answer_text = self._html_to_markdown(answer.answer) + lines.append(f" {letter}. {marker}{answer_text}") + if answer.answer_feedback: + feedback_text = self._html_to_markdown(answer.answer_feedback) + lines.append(f" @Feedback: {feedback_text}") + return "\n".join(lines) + + def _format_matching_markdown(self, question): + lines = [] + matching = question.get_matching() + if matching: + choices = matching.get_matching_choices() + for idx, choice in enumerate(choices, start=1): + letter = chr(96 + idx) + choice_text = self._html_to_markdown(choice.choice_text) + + answers = choice.matching_answers.all() + if answers: + answer = answers[0] + answer_text = self._html_to_markdown(answer.answer_text) + lines.append(f" {letter}. {choice_text} = {answer_text}") + else: + lines.append(f" {letter}. 
class ScormParser:
    """
    Parse SCORM XML files (questiondb.xml, imsmanifest.xml) into dicts.

    The parser works on an already-extracted SCORM package directory. Both
    XML documents are loaded eagerly by the constructor; ``parse_manifest``
    and ``parse_questiondb`` then expose them as plain dictionaries.
    Image references found in question, hint and feedback HTML are replaced
    by base64 data URIs when the file can be located inside the package.
    """

    # Clark-notation prefix of the Desire2Learn extension namespace.
    _D2L_NS = "{http://desire2learn.com/xsd/d2lcp_v2p0}"

    # <img ... src="..." ...>; groups: attributes before src, the src value,
    # attributes after src.
    _IMG_TAG_RE = re.compile(r'<img([^>]*?)src=["\']([^"\']+)["\']([^>]*?)>')

    _MIME_TYPES = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".gif": "image/gif",
        ".svg": "image/svg+xml",
        ".webp": "image/webp",
    }

    # qmd_questiontype value -> (short type code, name of the parsing method).
    _QUESTION_PARSERS = {
        "Multiple Choice": ("MC", "_parse_multiple_choice"),
        "True/False": ("TF", "_parse_true_false"),
        "Fill in the Blanks": ("FIB", "_parse_fill_in_the_blanks"),
        "Multi-Select": ("MS", "_parse_multi_select"),
        "Matching": ("MAT", "_parse_matching"),
        "Ordering": ("ORD", "_parse_ordering"),
        "Long Answer": ("WR", "_parse_written_response"),
    }

    def __init__(self, extracted_path):
        """
        Load the package's XML documents.

        Args:
            extracted_path: Directory holding the extracted SCORM package.

        Raises:
            FileNotFoundError: If questiondb.xml or imsmanifest.xml is missing.
        """
        self.extracted_path = extracted_path
        self.questiondb_xml = None
        self.imsmanifest_xml = None
        self._parse_xml_files()

    def _parse_xml_files(self):
        """Parse questiondb.xml and imsmanifest.xml from extracted files."""
        questiondb_path = path.join(self.extracted_path, "questiondb.xml")
        imsmanifest_path = path.join(self.extracted_path, "imsmanifest.xml")

        if not path.exists(questiondb_path):
            raise FileNotFoundError(f"questiondb.xml not found in SCORM package: {questiondb_path}")

        if not path.exists(imsmanifest_path):
            raise FileNotFoundError(f"imsmanifest.xml not found in SCORM package: {imsmanifest_path}")

        self.questiondb_xml = ET.parse(questiondb_path)
        self.imsmanifest_xml = ET.parse(imsmanifest_path)

    # ------------------------------------------------------------------
    # Manifest
    # ------------------------------------------------------------------

    @staticmethod
    def _local_name(tag):
        """Return an element tag without its ``{namespace}`` prefix."""
        return tag.rsplit("}", 1)[-1] if isinstance(tag, str) else ""

    def _children_named(self, parent, name):
        """Return the direct children of *parent* whose local tag name is *name*."""
        return [child for child in parent if self._local_name(child.tag) == name]

    def parse_manifest(self):
        """
        Parse imsmanifest.xml and extract metadata.

        Elements are matched by local name so that a manifest declaring a
        default XML namespace is read the same way as an un-namespaced one.

        Returns:
            dict: ``{"identifier": str, "resources": [dict, ...]}``
        """
        root = self.imsmanifest_xml.getroot()
        d2l = self._D2L_NS

        manifest_data = {
            "identifier": root.get("identifier", ""),
            "resources": [],
        }

        # Only the first <resources> container is read.
        for resources_el in self._children_named(root, "resources")[:1]:
            for resource_el in self._children_named(resources_el, "resource"):
                manifest_data["resources"].append(
                    {
                        "identifier": resource_el.get("identifier", ""),
                        "type": resource_el.get("type", ""),
                        "material_type": resource_el.get(d2l + "material_type", ""),
                        "href": resource_el.get("href", ""),
                        "link_target": resource_el.get(d2l + "link_target", ""),
                        "title": resource_el.get("title", ""),
                    }
                )

        return manifest_data

    # ------------------------------------------------------------------
    # Question library structure
    # ------------------------------------------------------------------

    def parse_questiondb(self):
        """
        Parse questiondb.xml and extract question library structure.

        Returns:
            dict: ``{"ident": str, "sections": [section dict, ...]}``

        Raises:
            ValueError: If the document has no ``objectbank`` element.
        """
        root = self.questiondb_xml.getroot()
        objectbank_el = root.find("objectbank")
        if objectbank_el is None:
            raise ValueError("objectbank element not found in questiondb.xml")

        return {
            "ident": objectbank_el.get("ident", ""),
            "sections": [
                self._parse_section(section_el)
                for section_el in objectbank_el.findall("section")
            ],
        }

    def _parse_section(self, section_el):
        """
        Parse a section element and extract section data.

        Nested sections are parsed recursively and stored under the
        ``"sections"`` key, which is only present when at least one nested
        section exists.
        """
        d2l = self._D2L_NS
        section_data = {
            "ident": section_el.get("ident", ""),
            "title": section_el.get("title", ""),
            "shuffle": False,
            "is_title_displayed": True,
            "is_text_displayed": False,
            "text": "",
            "questions": [],
        }

        selection_ordering = section_el.find("selection_ordering")
        if selection_ordering is not None:
            order_el = selection_ordering.find("order")
            if order_el is not None and order_el.get("order_type") == "Random":
                section_data["shuffle"] = True

        presentation_material = section_el.find("presentation_material")
        if presentation_material is not None:
            section_data["text"] = self._extract_material_text(presentation_material)

        sectionproc = section_el.find("sectionproc_extension")
        if sectionproc is not None:
            display_name = sectionproc.find(d2l + "display_section_name")
            if display_name is not None:
                # An empty element has text None; treat it as "not yes".
                flag = (display_name.text or "").strip().lower()
                section_data["is_title_displayed"] = flag == "yes"

            type_display = sectionproc.find(d2l + "type_display_section")
            if type_display is not None:
                section_data["is_text_displayed"] = type_display.text == "1"

        for nested_section_el in section_el.findall("section"):
            section_data.setdefault("sections", []).append(
                self._parse_section(nested_section_el)
            )

        for item_el in section_el.findall("item"):
            section_data["questions"].append(self._parse_question(item_el))

        return section_data

    def _parse_question(self, item_el):
        """
        Parse a question (item) element and extract question data.

        ``question_type_code`` is only added to the result when the item's
        ``qmd_questiontype`` is one of the supported types.
        """
        question_data = {
            "ident": item_el.get("ident", ""),
            "label": item_el.get("label", ""),
            "title": item_el.get("title", ""),
            "question_type": None,
            "points": 1.0,
            "text": "",
            "hint": None,
            "feedback": None,
            "question_specific_data": {},
        }

        itemmetadata = item_el.find("itemmetadata")
        qtidata = itemmetadata.find("qtimetadata") if itemmetadata is not None else None
        if qtidata is not None:
            for field in qtidata.findall("qti_metadatafield"):
                fieldlabel = field.find("fieldlabel")
                fieldentry = field.find("fieldentry")
                if fieldlabel is None or fieldentry is None:
                    continue
                if fieldlabel.text == "qmd_questiontype":
                    question_data["question_type"] = fieldentry.text
                elif fieldlabel.text == "qmd_weighting":
                    try:
                        question_data["points"] = float(fieldentry.text)
                    except (ValueError, TypeError):
                        pass  # keep the default of 1.0 on malformed weights

        presentation = item_el.find("presentation")
        if presentation is not None:
            question_data["text"] = self._extract_question_text(presentation)

        hint_el = item_el.find("hint")
        if hint_el is not None:
            question_data["hint"] = self._extract_hint_text(hint_el)

        # General feedback is the itemfeedback whose ident equals the label.
        for feedback_el in item_el.findall("itemfeedback"):
            if feedback_el.get("ident") == question_data["label"]:
                question_data["feedback"] = self._extract_feedback_text(feedback_el)

        handler = self._QUESTION_PARSERS.get(question_data["question_type"])
        if handler is not None:
            type_code, method_name = handler
            parse = getattr(self, method_name)
            question_data["question_specific_data"] = parse(item_el, question_data["label"])
            question_data["question_type_code"] = type_code

        return question_data

    # ------------------------------------------------------------------
    # Text extraction helpers
    # ------------------------------------------------------------------

    def _mattext_html(self, mattext, with_tail=False):
        """
        Return the HTML carried by a ``mattext`` element.

        The text is HTML-unescaped and whitespace-normalized. Returns "" when
        *mattext* is None. With *with_tail*, the element's tail text is
        appended before decoding.
        """
        if mattext is None:
            return ""
        raw_text = mattext.text or ""
        if with_tail and mattext.tail:
            raw_text += mattext.tail
        return self._clean_cdata(html.unescape(raw_text))

    def _extract_material_text(self, material_el):
        """
        Extract text content from material element, handling CDATA and images.

        Only ``material`` elements below the element's ``flow_mat`` child are
        read; their texts are concatenated.
        """
        flow_mat = material_el.find("flow_mat")
        if flow_mat is None:
            return ""

        text_parts = []
        for material in flow_mat.findall(".//material"):
            mattext = material.find("mattext")
            if mattext is not None:
                decoded = self._mattext_html(mattext, with_tail=True)
                text_parts.append(self._inline_scorm_images(decoded))
        return "".join(text_parts)

    def _extract_question_text(self, presentation_el):
        """
        Extract question text from presentation element.

        Reads the first ``flow/material/mattext``; returns "" if any of those
        elements is missing.
        """
        flow = presentation_el.find("flow")
        material = flow.find("material") if flow is not None else None
        mattext = material.find("mattext") if material is not None else None
        if mattext is None:
            return ""
        return self._inline_scorm_images(self._mattext_html(mattext, with_tail=True))

    def _extract_hint_text(self, hint_el):
        """Extract text from hint element, or None without ``hintmaterial``."""
        hintmaterial = hint_el.find("hintmaterial")
        if hintmaterial is not None:
            return self._extract_material_text(hintmaterial)
        return None

    def _extract_feedback_text(self, feedback_el):
        """
        Extract text from feedback element.

        Returns None when the element has no ``material/mattext``.
        """
        material = feedback_el.find("material")
        mattext = material.find("mattext") if material is not None else None
        if mattext is None:
            return None
        return self._inline_scorm_images(self._mattext_html(mattext))

    def _clean_cdata(self, text):
        """
        Normalize whitespace from CDATA sections while preserving HTML tags.

        Runs of whitespace collapse to one space, whitespace between adjacent
        tags is removed, and the result is stripped. Falsy input yields "".
        """
        if not text:
            return ""
        cleaned = re.sub(r"[ \t\n\r]+", " ", text)
        cleaned = re.sub(r">\s+<", "><", cleaned)
        return cleaned.strip()

    # ------------------------------------------------------------------
    # Image inlining
    # ------------------------------------------------------------------

    def _inline_scorm_images(self, html_text):
        """
        Convert SCORM image file paths to base64 data URIs in HTML text.

        ``<img>`` tags whose ``src`` is already a data URI or an http(s) URL
        are left untouched, as are tags whose file cannot be found or read
        inside the extracted package.
        """
        if not html_text or not self.extracted_path:
            return html_text
        return self._IMG_TAG_RE.sub(self._inline_image_match, html_text)

    def _inline_image_match(self, match):
        """Return the replacement for one ``<img>`` regex match."""
        import logging

        logger = logging.getLogger(__name__)
        before_src, img_src, after_src = match.groups()

        if img_src.startswith(("data:", "http://", "https://")) or "base64" in img_src:
            return match.group(0)

        image_file = self._locate_image(img_src)
        if image_file is None:
            logger.warning(
                "SCORM image not found: %s (searched in %s)", img_src, self.extracted_path
            )
            return match.group(0)

        try:
            with open(image_file, "rb") as f:
                image_data = f.read()
        except OSError:
            # Best effort: keep the original tag but leave a trace in the log.
            logger.warning("Could not read SCORM image %s", image_file, exc_info=True)
            return match.group(0)

        base64_data = base64.b64encode(image_data).decode("utf-8")
        ext = path.splitext(image_file)[1].lower()
        mime_type = self._MIME_TYPES.get(ext, "image/png")
        base64_src = f"data:{mime_type};base64,{base64_data}"
        logger.info(
            "Converted SCORM image %s to base64 (%d chars)",
            path.basename(image_file),
            len(base64_data),
        )
        return f'<img{before_src}src="{base64_src}"{after_src}>'

    @staticmethod
    def _is_within(root, candidate):
        """Return True if *candidate* resolves to a location below *root*."""
        real = path.realpath(candidate)
        return real.startswith(root.rstrip(os.sep) + os.sep)

    def _locate_image(self, img_src):
        """
        Find the file an ``<img src>`` refers to inside the package.

        Tries the path relative to the package root, then the
        ``assessment-assets`` folder, then any file with the same name
        anywhere in the package. Candidates that resolve outside the package
        directory (``..`` segments, symlinks) are rejected, because the
        package content is untrusted.

        Returns:
            str | None: Path of the image file, or None if not found.
        """
        root = path.realpath(self.extracted_path)

        # Drop leading "", "." and ".." path segments only; file names that
        # merely start with a dot are kept intact.
        segments = img_src.split("/")
        while segments and segments[0] in ("", ".", ".."):
            segments.pop(0)
        relative_path = "/".join(segments)
        file_name = path.basename(relative_path)
        if not file_name:
            return None

        candidates = (
            path.join(root, relative_path),
            path.join(root, "assessment-assets", file_name),
        )
        for candidate in candidates:
            if path.isfile(candidate) and self._is_within(root, candidate):
                return candidate

        for dirpath, _dirnames, filenames in os.walk(root):
            if file_name in filenames:
                found = path.join(dirpath, file_name)
                if self._is_within(root, found):
                    return found
        return None

    # ------------------------------------------------------------------
    # Response-processing helpers
    # ------------------------------------------------------------------

    def _read_int_extension(self, parent, name, default):
        """
        Read an integer D2L extension element below *parent*.

        Returns *default* when the element is absent, empty or not an integer.
        """
        element = parent.find(self._D2L_NS + name)
        if element is not None and element.text:
            try:
                return int(element.text)
            except (ValueError, TypeError):
                pass
        return default

    def _iter_conditions(self, item_el, respident):
        """
        Yield ``(respcondition, varequal)`` pairs of the item's resprocessing
        whose first ``varequal`` targets *respident*.
        """
        resprocessing = item_el.find("resprocessing")
        if resprocessing is None:
            return
        for respcondition in resprocessing.findall("respcondition"):
            conditionvar = respcondition.find("conditionvar")
            if conditionvar is None:
                continue
            varequal = conditionvar.find("varequal")
            if varequal is not None and varequal.get("respident") == respident:
                yield respcondition, varequal

    @staticmethod
    def _find_itemfeedback(item_el, ident):
        """
        Return the descendant ``itemfeedback`` with the given ident, or None.

        The ident is compared as a plain string rather than interpolated
        into an XPath predicate, so quotes in it cannot break the lookup.
        """
        for feedback_el in item_el.iter("itemfeedback"):
            if feedback_el.get("ident") == ident:
                return feedback_el
        return None

    def _condition_feedback_el(self, item_el, respcondition):
        """Return the itemfeedback element a respcondition links to, or None."""
        displayfeedback = respcondition.find("displayfeedback")
        if displayfeedback is None:
            return None
        return self._find_itemfeedback(item_el, displayfeedback.get("linkrefid", ""))

    @staticmethod
    def _setvar_score(respcondition, default):
        """Return the numeric ``setvar`` value of a respcondition, else *default*."""
        setvar = respcondition.find("setvar")
        if setvar is not None:
            try:
                return float(setvar.text)
            except (ValueError, TypeError):
                pass
        return default

    # ------------------------------------------------------------------
    # Question types
    # ------------------------------------------------------------------

    def _parse_multiple_choice(self, item_el, question_ident):
        """
        Parse multiple choice question data.

        Returns:
            dict: ``randomize``, ``enumeration`` and the ordered ``answers``
            (each with ``answer``, ``weight``, ``answer_feedback``, ``order``).
        """
        mc_data = {
            "randomize": False,
            "enumeration": 4,
            "answers": [],
        }

        presentation = item_el.find("presentation")
        flow = presentation.find("flow") if presentation is not None else None
        if flow is None:
            return mc_data

        response_ext = flow.find("response_extension")
        if response_ext is not None:
            mc_data["enumeration"] = self._read_int_extension(
                response_ext, "enumeration", mc_data["enumeration"]
            )

        response_lid = flow.find("response_lid")
        if response_lid is None:
            return mc_data

        render_choice = response_lid.find("render_choice")
        if render_choice is not None:
            mc_data["randomize"] = render_choice.get("shuffle", "no").lower() == "yes"

        question_lid = response_lid.get("ident", "")
        answer_index = 1
        for flow_label in response_lid.findall(".//flow_label"):
            response_label = flow_label.find("response_label")
            if response_label is None:
                continue
            answer_ident = response_label.get("ident", "")
            answer_text = self._mattext_html(response_label.find(".//mattext"))

            weight = 0.0
            answer_feedback = None
            for respcondition, varequal in self._iter_conditions(item_el, question_lid):
                if varequal.text != answer_ident:
                    continue
                weight = self._setvar_score(respcondition, weight)
                feedback_el = self._condition_feedback_el(item_el, respcondition)
                if feedback_el is not None:
                    answer_feedback = self._extract_feedback_text(feedback_el)

            mc_data["answers"].append(
                {
                    "answer": answer_text,
                    "weight": weight,
                    "answer_feedback": answer_feedback,
                    "order": answer_index,
                }
            )
            answer_index += 1

        return mc_data

    def _parse_true_false(self, item_el, question_ident):
        """
        Parse true/false question data.

        The first response label is taken as "True" and the second as
        "False"; weights and feedback are read from the matching
        respconditions.
        """
        tf_data = {
            "true_weight": 0.0,
            "true_feedback": None,
            "false_weight": 0.0,
            "false_feedback": None,
            "enumeration": 4,
        }

        presentation = item_el.find("presentation")
        flow = presentation.find("flow") if presentation is not None else None
        if flow is None:
            return tf_data

        response_ext = flow.find("response_extension")
        if response_ext is not None:
            tf_data["enumeration"] = self._read_int_extension(
                response_ext, "enumeration", tf_data["enumeration"]
            )

        response_lid = flow.find("response_lid")
        if response_lid is None:
            return tf_data
        question_lid = response_lid.get("ident", "")

        idents = {}
        render_choice = response_lid.find("render_choice")
        if render_choice is not None:
            response_labels = render_choice.findall(".//response_label")
            for prefix, response_label in zip(("true", "false"), response_labels):
                idents[prefix] = response_label.get("ident", "")

        for respcondition, varequal in self._iter_conditions(item_el, question_lid):
            answer_ident = varequal.text
            if idents.get("true") and answer_ident == idents["true"]:
                prefix = "true"
            elif idents.get("false") and answer_ident == idents["false"]:
                prefix = "false"
            else:
                continue

            weight_key = f"{prefix}_weight"
            tf_data[weight_key] = self._setvar_score(respcondition, tf_data[weight_key])
            feedback_el = self._condition_feedback_el(item_el, respcondition)
            if feedback_el is not None:
                tf_data[f"{prefix}_feedback"] = self._extract_feedback_text(feedback_el)

        return tf_data

    def _parse_fill_in_the_blanks(self, item_el, question_ident):
        """
        Parse fill in the blanks question data.

        Children of ``flow`` are read in document order: ``material`` becomes
        a ``fibquestion`` segment, ``response_str`` becomes a ``fibanswer``
        whose accepted answers are joined with commas.
        """
        fib_data = {"fibs": []}

        presentation = item_el.find("presentation")
        flow = presentation.find("flow") if presentation is not None else None
        if flow is None:
            return fib_data

        # NOTE(review): the counter advances for every child of <flow> and is
        # used both as the segment order and inside the answer ident
        # (<label><n>_ANS) - confirm this matches the writer's numbering.
        for idx, child in enumerate(flow, start=1):
            if child.tag == "material":
                mattext = child.find("mattext")
                text = ""
                if mattext is not None:
                    # Whitespace is kept as-is: segments are later concatenated.
                    text = html.unescape(mattext.text or "")
                fib_data["fibs"].append({"type": "fibquestion", "text": text, "order": idx})

            elif child.tag == "response_str":
                question_ans = question_ident + str(idx) + "_ANS"
                answers = [
                    varequal.text
                    for _respcondition, varequal in self._iter_conditions(item_el, question_ans)
                    if varequal.text
                ]
                fib_data["fibs"].append(
                    {
                        "type": "fibanswer",
                        "text": ",".join(answers),
                        "order": idx,
                        "size": 30,
                    }
                )

        return fib_data

    def _parse_multi_select(self, item_el, question_ident):
        """
        Parse multi-select question data.

        An answer is correct when a matching respcondition carries a
        ``setvar`` with ``varname="D2L_Correct"``.
        """
        ms_data = {
            "randomize": False,
            "enumeration": 4,
            "style": 2,
            "grading_type": 2,
            "answers": [],
        }

        presentation = item_el.find("presentation")
        flow = presentation.find("flow") if presentation is not None else None
        if flow is None:
            return ms_data

        response_ext = flow.find("response_extension")
        if response_ext is not None:
            ms_data["enumeration"] = self._read_int_extension(
                response_ext, "enumeration", ms_data["enumeration"]
            )
            ms_data["grading_type"] = self._read_int_extension(
                response_ext, "grading_type", ms_data["grading_type"]
            )

        response_lid = flow.find("response_lid")
        if response_lid is None:
            return ms_data
        question_lid = response_lid.get("ident", "")

        render_choice = response_lid.find("render_choice")
        if render_choice is not None:
            ms_data["randomize"] = render_choice.get("shuffle", "no").lower() == "yes"

        answer_index = 1
        for flow_label in response_lid.findall(".//flow_label"):
            response_label = flow_label.find("response_label")
            if response_label is None:
                continue
            answer_ident = response_label.get("ident", "")
            answer_text = self._mattext_html(response_label.find(".//mattext"))

            is_correct = False
            answer_feedback = None
            for respcondition, varequal in self._iter_conditions(item_el, question_lid):
                if varequal.text != answer_ident:
                    continue
                setvar = respcondition.find("setvar")
                if setvar is not None and setvar.get("varname") == "D2L_Correct":
                    is_correct = True
                feedback_el = self._condition_feedback_el(item_el, respcondition)
                if feedback_el is not None:
                    answer_feedback = self._extract_feedback_text(feedback_el)

            ms_data["answers"].append(
                {
                    "answer": answer_text,
                    "is_correct": is_correct,
                    "answer_feedback": answer_feedback,
                    "order": answer_index,
                }
            )
            answer_index += 1

        return ms_data

    def _parse_matching(self, item_el, question_ident):
        """
        Parse matching question data.

        Each ``response_grp`` is one choice; its correct answer is the
        response label referenced by the first matching respcondition marked
        ``D2L_Correct``.
        """
        mat_data = {
            "grading_type": 0,
            "choices": [],
        }

        presentation = item_el.find("presentation")
        flow = presentation.find("flow") if presentation is not None else None
        if flow is None:
            return mat_data

        response_ext = flow.find("response_extension")
        if response_ext is not None:
            mat_data["grading_type"] = self._read_int_extension(
                response_ext, "grading_type", mat_data["grading_type"]
            )

        response_grps = flow.findall("response_grp")

        # First pass: collect every distinct answer label (ident -> text).
        matching_answers = {}
        for response_grp in response_grps:
            render_choice = response_grp.find("render_choice")
            if render_choice is None:
                continue
            for response_label in render_choice.findall(".//response_label"):
                answer_ident = response_label.get("ident", "")
                mattext = response_label.find(".//mattext")
                if mattext is None:
                    continue
                answer_text = self._mattext_html(mattext)
                if answer_text and answer_ident not in matching_answers:
                    matching_answers[answer_ident] = answer_text

        # Second pass: pair each choice with its correct answer.
        for response_grp in response_grps:
            choice_ident = response_grp.get("respident", "")

            material = response_grp.find("material")
            choice_text = ""
            if material is not None:
                choice_text = self._mattext_html(material.find("mattext"))

            correct_answer_ident = None
            for respcondition, varequal in self._iter_conditions(item_el, choice_ident):
                setvar = respcondition.find("setvar")
                if setvar is not None and setvar.get("varname") == "D2L_Correct":
                    correct_answer_ident = varequal.text
                    break

            matching_answers_list = []
            if correct_answer_ident and correct_answer_ident in matching_answers:
                matching_answers_list.append(
                    {"answer_text": matching_answers[correct_answer_ident]}
                )

            mat_data["choices"].append(
                {"choice_text": choice_text, "matching_answers": matching_answers_list}
            )

        return mat_data

    def _parse_ordering(self, item_el, question_ident):
        """
        Parse ordering question data.

        Items are numbered from 1 in document order; feedback for item *n* is
        looked up under the ident ``<question_ident>_IF<n>``.
        """
        ord_data = {"items": []}

        presentation = item_el.find("presentation")
        flow = presentation.find("flow") if presentation is not None else None
        if flow is None:
            return ord_data

        response_grp = flow.find('response_grp[@rcardinality="Ordered"]')
        render_choice = response_grp.find("render_choice") if response_grp is not None else None
        if render_choice is None:
            return ord_data

        response_labels = render_choice.findall(".//response_label")
        for order_index, response_label in enumerate(response_labels, start=1):
            text = self._mattext_html(response_label.find(".//mattext"))

            ord_feedback = None
            feedback_ident = question_ident + "_IF" + str(order_index)
            feedback_el = self._find_itemfeedback(item_el, feedback_ident)
            if feedback_el is not None:
                ord_feedback = self._extract_feedback_text(feedback_el)

            ord_data["items"].append(
                {"text": text, "order": order_index, "ord_feedback": ord_feedback}
            )

        return ord_data

    def _parse_written_response(self, item_el, question_ident):
        """
        Parse written response question data.

        Returns:
            dict: ``enable_student_editor``, ``initial_text`` (None when
            empty), ``answer_key`` ("" when absent) and ``enable_attachments``.
        """
        wr_data = {
            "enable_student_editor": False,
            "initial_text": None,
            "answer_key": "",
            "enable_attachments": False,
        }

        presentation = item_el.find("presentation")
        flow = presentation.find("flow") if presentation is not None else None
        response_ext = flow.find("response_extension") if flow is not None else None
        if response_ext is not None:
            editor_el = response_ext.find(self._D2L_NS + "has_htmleditor")
            if editor_el is not None:
                wr_data["enable_student_editor"] = (editor_el.text or "").lower() == "yes"

        answer_key_el = item_el.find("answer_key")
        if answer_key_el is not None:
            answer_key_mat = answer_key_el.find("answer_key_material")
            if answer_key_mat is not None:
                mattext = answer_key_mat.find(".//mattext")
                if mattext is not None:
                    wr_data["answer_key"] = self._mattext_html(mattext)

        initial_text_el = item_el.find("initial_text")
        if initial_text_el is not None:
            initial_text_mat = initial_text_el.find("initial_text_material")
            if initial_text_mat is not None:
                mattext = initial_text_mat.find(".//mattext")
                if mattext is not None:
                    cleaned_text = self._mattext_html(mattext)
                    wr_data["initial_text"] = cleaned_text if cleaned_text else None

        return wr_data
MultiSelectBuilder, + MatchingBuilder, + OrderingBuilder, + WrittenResponseBuilder, +): + pass diff --git a/api/formats/scorm/scorm_unzipper.py b/api/formats/scorm/scorm_unzipper.py new file mode 100644 index 0000000..286328b --- /dev/null +++ b/api/formats/scorm/scorm_unzipper.py @@ -0,0 +1,30 @@ +from os import path, makedirs +from zipfile import ZipFile +from django.conf import settings + + +def extract_scorm_zip(scorm_zip_path, extract_to_path=None): + """ + Extract a SCORM ZIP file and return the extraction path. + + Args: + scorm_zip_path: Path to the SCORM ZIP file + extract_to_path: Optional path to extract ZIP contents + + Returns: + str: Path where the ZIP was extracted + """ + if not path.exists(scorm_zip_path): + raise FileNotFoundError(f"SCORM ZIP file not found: {scorm_zip_path}") + + if extract_to_path is None: + zip_basename = path.splitext(path.basename(scorm_zip_path))[0] + extract_to_path = path.join(settings.MEDIA_ROOT, f"scorm_extract_{zip_basename}") + + if not path.exists(extract_to_path): + makedirs(extract_to_path) + + with ZipFile(scorm_zip_path, "r") as zip_ref: + zip_ref.extractall(extract_to_path) + + return extract_to_path diff --git a/api/formats/scorm/scorm_writer.py b/api/formats/scorm/scorm_writer.py new file mode 100644 index 0000000..46d0846 --- /dev/null +++ b/api/formats/scorm/scorm_writer.py @@ -0,0 +1,141 @@ +import datetime +import random +import time +import xml.etree.cElementTree as ET +from uuid import UUID +from xml.dom.minidom import parseString + +from .scorm_question_builder import ScormQuestionBuilder +from .xmlcdata import CDATA + + +class ScormWriter(ScormQuestionBuilder): + def __init__(self, question_library): + ident = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f") + questionLibraryIdent = "QLIB_" + ident + root_el = ET.Element( + "questestinterop", + {"xmlns:d2l_2p0": "http://desire2learn.com/xsd/d2lcp_v2p0"}, + ) + objectbank_el = ET.SubElement( + root_el, + "objectbank", + {"ident": questionLibraryIdent, 
"xmlns:d2l_2p0": "http://desire2learn.com/xsd/d2lcp_v2p0"}, + ) + + base_ident = "SECT_" + str(datetime.datetime.now().strftime("%Y%m%d%H%M%S")) + str( + int(UUID(int=0x12345678123456781234567812345678)) + ) + base_section_el = ET.SubElement( + objectbank_el, + "section", + {"ident": base_ident, "title": self._safe_attr(question_library.main_title)}, + ) + if question_library.shuffle is True: + self.create_section_shuffle(base_section_el) + + self.create_presentation_material(base_section_el, question_library.main_text) + + sec_proc = ET.SubElement(base_section_el, "sectionproc_extension") + sec_proc_dis_name = ET.SubElement(sec_proc, "d2l_2p0:display_section_name") + sec_proc_dis_name.text = "yes" + sec_proc_dis_line = ET.SubElement(sec_proc, "d2l_2p0:display_section_line") + sec_proc_dis_line.text = "no" + sec_proc_dis_sec = ET.SubElement(sec_proc, "d2l_2p0:type_display_section") + sec_proc_dis_sec.text = "0" + + section_objs = question_library.get_sections() + for section_obj in section_objs: + if section_obj.is_main_content is True: + root_question_objs = section_obj.get_questions() + self.create_questions(base_section_el, root_question_objs) + else: + current_section_el = self.create_section(base_section_el, section_obj) + question_objs = section_obj.get_questions() + self.create_questions(current_section_el, question_objs) + self.questiondb_string = self.xml_to_string(root_el) + + def _safe_attr(self, value): + return "" if value is None else str(value) + + def create_section(self, parent_el, section_obj): + sectionIdent = "SECT_" + str(datetime.datetime.now().strftime("%Y%m%d%H%M%S")) + str( + int(UUID(int=0x12345678123456781234567812345678)) + ) + section_el = ET.SubElement( + parent_el, + "section", + {"ident": sectionIdent, "title": self._safe_attr(section_obj.title)}, + ) + if section_obj.shuffle is True: + self.create_section_shuffle(section_el) + + self.create_presentation_material(section_el, section_obj.text) + 
self.create_sectionproc_extension(section_el, section_obj) + + return section_el + + def create_section_shuffle(self, section_el): + sel_ord = ET.SubElement(section_el, "selection_ordering") + sel_ord_ord = ET.SubElement(sel_ord, "order", {"order_type": "Random"}) + + def create_presentation_material(self, section_el, section_text): + sec_pres_mat = ET.SubElement(section_el, "presentation_material") + sec_pres_mat_flo = ET.SubElement(sec_pres_mat, "flow_mat") + sec_pres_mat_flo_flo = ET.SubElement(sec_pres_mat_flo, "flow_mat") + sec_pres_mat_flo_flo_mat = ET.SubElement(sec_pres_mat_flo_flo, "material") + sec_pres_mat_flo_flo_mat_text = ET.SubElement(sec_pres_mat_flo_flo_mat, "mattext", {"texttype": "text/html"}) + if section_text: + sec_pres_mat_flo_flo_mat_text.append(CDATA(section_text)) + + def create_sectionproc_extension(self, section_el, section_obj): + sec_proc = ET.SubElement(section_el, "sectionproc_extension") + sec_proc_dis_name = ET.SubElement(sec_proc, "d2l_2p0:display_section_name") + sec_proc_dis_name.text = "yes" if section_obj.is_title_displayed in (None, True) else "no" + sec_proc_dis_line = ET.SubElement(sec_proc, "d2l_2p0:display_section_line") + sec_proc_dis_line.text = "no" + sec_proc_dis_sec = ET.SubElement(sec_proc, "d2l_2p0:type_display_section") + if section_obj.is_text_displayed is None: + sec_proc_dis_sec.text = "0" + else: + sec_proc_dis_sec.text = "1" if section_obj.is_text_displayed else "0" + + def create_questions(self, section_el, question_objs): + for question in question_objs: + time_ns = str(time.process_time_ns()) + random_int = str(random.randint(1000000, 9999999)) + ident = time_ns + random_int + question_ident = "QUES_" + ident + item_el = ET.Element( + "item", + { + "ident": "OBJ_" + ident, + "label": question_ident, + "d2l_2p0:page": "1", + "title": self._safe_attr(question.title), + }, + ) + question_type = question.questiontype + match question_type: + case "MC": + self.generate_multiple_choice(item_el, question_ident, 
question) + case "TF": + self.generate_true_false(item_el, question_ident, question) + case "FIB" | "FMB": + self.generate_fill_in_the_blanks(item_el, question_ident, question) + case "MS" | "MR": + self.generate_multi_select(item_el, question_ident, question) + case "MAT" | "MT": + self.generate_matching(item_el, question_ident, question) + case "ORD": + self.generate_ordering(item_el, question_ident, question) + case "WR" | "E": + self.generate_written_response(item_el, question_ident, question) + + section_el.append(item_el) + + def xml_to_string(self, xml): + rough_string = ET.tostring(xml, "utf-8") + reparsed = parseString(rough_string) + pretty_xml = reparsed.toprettyxml(indent="\t") + return pretty_xml diff --git a/api/formats/scorm/xml_builders/__init__.py b/api/formats/scorm/xml_builders/__init__.py new file mode 100644 index 0000000..1d942aa --- /dev/null +++ b/api/formats/scorm/xml_builders/__init__.py @@ -0,0 +1,19 @@ +from .base import BaseQuestionBuilder +from .multiple_choice import MultipleChoiceBuilder +from .true_false import TrueFalseBuilder +from .fib import FillInTheBlanksBuilder +from .multi_select import MultiSelectBuilder +from .matching import MatchingBuilder +from .ordering import OrderingBuilder +from .written_response import WrittenResponseBuilder + +__all__ = [ + "BaseQuestionBuilder", + "MultipleChoiceBuilder", + "TrueFalseBuilder", + "FillInTheBlanksBuilder", + "MultiSelectBuilder", + "MatchingBuilder", + "OrderingBuilder", + "WrittenResponseBuilder", +] diff --git a/api/formats/scorm/xml_builders/base.py b/api/formats/scorm/xml_builders/base.py new file mode 100644 index 0000000..47f54f4 --- /dev/null +++ b/api/formats/scorm/xml_builders/base.py @@ -0,0 +1,57 @@ +import xml.etree.cElementTree as ET +from xml.dom.minidom import parseString + +from ..xmlcdata import CDATA + + +class BaseQuestionBuilder: + def itemetadata(self, it, question_type, question): + it_metadata = ET.SubElement(it, "itemmetadata") + it_metadata_qtidata = 
ET.SubElement(it_metadata, "qtimetadata") + it_computer_scored = ET.SubElement(it_metadata_qtidata, "qti_metadatafield") + it_computer_scored_label = ET.SubElement(it_computer_scored, "fieldlabel") + it_computer_scored_label.text = "qmd_computerscored" + it_computer_scored_entry = ET.SubElement(it_computer_scored, "fieldentry") + it_computer_scored_entry.text = "yes" + it_question_type = ET.SubElement(it_metadata_qtidata, "qti_metadatafield") + it_question_type_label = ET.SubElement(it_question_type, "fieldlabel") + it_question_type_label.text = "qmd_questiontype" + it_question_type_entry = ET.SubElement(it_question_type, "fieldentry") + it_question_type_entry.text = question_type + it_weighting = ET.SubElement(it_metadata_qtidata, "qti_metadatafield") + it_weighting_label = ET.SubElement(it_weighting, "fieldlabel") + it_weighting_label.text = "qmd_weighting" + it_weighting_entry = ET.SubElement(it_weighting, "fieldentry") + points = question.points if question.points is not None else 1 + it_weighting_entry.text = "{:.4f}".format(points) + + def itemproc_extension(self, it): + it_proc = ET.SubElement(it, "itemproc_extension") + it_proc_difficulty = ET.SubElement(it_proc, "d2l_2p0:difficulty") + it_proc_difficulty.text = "1" + it_proc_isbonus = ET.SubElement(it_proc, "d2l_2p0:isbonus") + it_proc_isbonus.text = "no" + it_proc_ismandatory = ET.SubElement(it_proc, "d2l_2p0:ismandatory") + it_proc_ismandatory.text = "no" + + def generate_feedback(self, it, ident, feedback): + it_fb = ET.SubElement(it, "itemfeedback", {"ident": ident}) + it_fb_mat = ET.SubElement(it_fb, "material") + it_fb_mat_text = ET.SubElement(it_fb_mat, "mattext", {"texttype": "text/html"}) + it_fb_mat_text.append(CDATA(feedback)) + + def generate_hint(self, it, hint): + it_hint = ET.SubElement(it, "hint") + it_hint_mat = ET.SubElement(it_hint, "hintmaterial") + it_hint_mat_flow = ET.SubElement(it_hint_mat, "flow_mat") + it_hint_mat_flow_mat = ET.SubElement(it_hint_mat_flow, "material") + 
it_hint_mat_flow_text = ET.SubElement( + it_hint_mat_flow_mat, "mattext", {"texttype": "text/html"} + ) + it_hint_mat_flow_text.append(CDATA(hint)) + + def xml_to_string(self, xml): + rough_string = ET.tostring(xml, "utf-8") + reparsed = parseString(rough_string) + pretty_xml = reparsed.toprettyxml(indent="\t") + return pretty_xml diff --git a/api/formats/scorm/xml_builders/fib.py b/api/formats/scorm/xml_builders/fib.py new file mode 100644 index 0000000..6f4cf68 --- /dev/null +++ b/api/formats/scorm/xml_builders/fib.py @@ -0,0 +1,71 @@ +import xml.etree.cElementTree as ET + +from ..xmlcdata import CDATA + + +class FillInTheBlanksBuilder: + def generate_fill_in_the_blanks(self, it, question_ident, question): + self.itemetadata(it, "Fill in the Blanks", question) + self.itemproc_extension(it) + + it_pre = ET.SubElement(it, "presentation") + it_pre_flow = ET.SubElement(it_pre, "flow") + + idx = 1 + for fib in question.get_fibs(): + question_str = question_ident + str(idx) + "_STR" + question_ans = question_ident + str(idx) + "_ANS" + if fib.type == "fibanswer": + it_pre_flow_str = ET.SubElement( + it_pre_flow, "response_str", {"rcardinality": "Single", "ident": question_str} + ) + it_pre_flow_str_render = ET.SubElement( + it_pre_flow_str, + "render_fib", + {"fibtype": "String", "prompt": "Box", "columns": "30", "rows": "1"}, + ) + ET.SubElement(it_pre_flow_str_render, "response_label", {"ident": question_ans}) + idx += 1 + elif fib.type == "fibquestion": + it_pre_flow_mat = ET.SubElement(it_pre_flow, "material") + it_pre_flow_mat_text = ET.SubElement(it_pre_flow_mat, "mattext", {"texttype": "text/html"}) + question_text = fib.text + it_pre_flow_mat_text.append(CDATA(question_text)) + + if question.hint: + self.generate_hint(it, question.hint) + + it_res = ET.SubElement(it, "resprocessing") + it_out = ET.SubElement(it_res, "outcomes") + + index = 1 + fib_answers_qs = list(question.get_fib_answers() or []) + if not fib_answers_qs: + return + answer_weight = str(100.0 / 
len(fib_answers_qs)) + for fib_answers in fib_answers_qs: + if not fib_answers.text: + index += 1 + continue + answers = [a.strip() for a in fib_answers.text.split(",") if a.strip()] + question_ans = question_ident + str(index) + "_ANS" + for answer in answers: + it_res_con = ET.SubElement(it_res, "respcondition") + it_res_con_var = ET.SubElement(it_res_con, "conditionvar") + it_res_con_var_equal = ET.SubElement( + it_res_con_var, "varequal", {"case": "no", "respident": question_ans} + ) + it_res_con_var_equal.text = answer + it_res_set_var = ET.SubElement(it_res_con, "setvar", {"action": "Set"}) + it_res_set_var.text = answer_weight + + ET.SubElement( + it_out, + "decvar", + {"varname": "Blank_" + str(index), "maxvalue": "100", "minvalue": "0", "vartype": "Integer"}, + ) + + index += 1 + + if question.feedback: + self.generate_feedback(it, question_ident, question.feedback) diff --git a/api/formats/scorm/xml_builders/matching.py b/api/formats/scorm/xml_builders/matching.py new file mode 100644 index 0000000..f3576c0 --- /dev/null +++ b/api/formats/scorm/xml_builders/matching.py @@ -0,0 +1,122 @@ +import copy +import xml.etree.cElementTree as ET + +from ..xmlcdata import CDATA + + +class MatchingBuilder: + def generate_matching(self, it, question_ident, question): + self.itemetadata(it, "Matching", question) + self.itemproc_extension(it) + matching = question.get_matching() + question_ident_choice = question_ident + "_C" + question_ident_answer = question_ident + "_A" + + it_pre = ET.SubElement(it, "presentation") + it_pre_flow = ET.SubElement(it_pre, "flow") + + if question.hint: + self.generate_hint(it, question.hint) + + it_res = ET.SubElement(it, "resprocessing") + it_res_out = ET.SubElement(it_res, "outcomes") + ET.SubElement(it_res_out, "decvar", {"vartype": "Integer", "defaultval": "0", "varname": "D2L_Correct", "minvalue": "0", "maxvalue": "100"}) + ET.SubElement(it_res_out, "decvar", {"vartype": "Integer", "defaultval": "0", "varname": "D2L_Incorrect", 
"minvalue": "0", "maxvalue": "100"}) + ET.SubElement(it_res_out, "decvar", {"vartype": "Decimal", "defaultval": "0", "varname": "que_score", "minvalue": "0", "maxvalue": "100"}) + + it_pre_flow_mat = ET.SubElement(it_pre_flow, "material") + it_pre_flow_mat_text = ET.SubElement(it_pre_flow_mat, "mattext", {"texttype": "text/html"}) + question_text = question.text + it_pre_flow_mat_text.append(CDATA(question_text)) + + it_pre_flow_res = ET.SubElement(it_pre_flow, "response_extension") + it_pre_flow_res_grading_type = ET.SubElement(it_pre_flow_res, "d2l_2p0:grading_type") + it_pre_flow_res_grading_type.text = "2" + + it_pre_flow_res_grp_ren = ET.Element("render_choice", {"shuffle": "yes"}) + it_pre_flow_res_grp_ren_flow = ET.SubElement(it_pre_flow_res_grp_ren, "flow_label", {"class": "Block"}) + + it_temp = ET.Element("temp") + matching_answers = matching.get_unique_matching_answers() + + ma_index = 1 + for matching_answer_text in matching_answers: + matching_answer_index = question_ident_answer + str(ma_index) + it_grp_ren_flow_lab = ET.SubElement(it_pre_flow_res_grp_ren_flow, "response_label", {"ident": matching_answer_index}) + it_grp_ren_flow_lab_flow = ET.SubElement(it_grp_ren_flow_lab, "flow_mat") + it_grp_ren_flow_lab_flow_mat = ET.SubElement(it_grp_ren_flow_lab_flow, "material") + it_grp_ren_flow_lab_flow_mat_text = ET.SubElement(it_grp_ren_flow_lab_flow_mat, "mattext", {"texttype": "text/html"}) + it_grp_ren_flow_lab_flow_mat_text.append(CDATA(matching_answer_text)) + + it_respcondition = ET.SubElement(it_temp, "respcondition") + it_respcondition_conditionvar = ET.SubElement(it_respcondition, "conditionvar") + it_respcondition_varequal = ET.SubElement(it_respcondition_conditionvar, "varequal") + it_respcondition_varequal.text = matching_answer_index + it_respcondition_setvar = ET.SubElement(it_respcondition, "setvar", {"action": "Add"}) + it_respcondition_setvar.text = "1" + + ma_index += 1 + + mc_index = 1 + for matching_choice in 
matching.get_matching_choices(): + matching_choice_index = question_ident_choice + str(mc_index) + + it_pre_flow_res_grp = ET.SubElement(it_pre_flow, "response_grp", {"respident": matching_choice_index, "rcardinality": "Single"}) + it_pre_flow_res_grp_mat = ET.SubElement(it_pre_flow_res_grp, "material") + it_pre_flow_res_grp_mattext = ET.SubElement(it_pre_flow_res_grp_mat, "mattext", {"texttype": "text/html"}) + it_pre_flow_res_grp_mattext.append(CDATA(matching_choice.choice_text)) + it_pre_flow_res_grp.append(it_pre_flow_res_grp_ren) + + for respcondition in it_temp: + conditionvar = respcondition.find("conditionvar") + varequal = conditionvar.find("varequal") + varequal.set("respident", matching_choice_index) + setvar = respcondition.find("setvar") + answer_mattext = it_pre_flow.find( + "response_grp[@respident='" + matching_choice_index + "'].//response_label[@ident='" + varequal.text + "'].//mattext" + ) + is_correct = matching_choice.has_matching_answer(answer_mattext[0].text) + if is_correct is True: + setvar.set("varname", "D2L_Correct") + else: + setvar.set("varname", "D2L_Incorrect") + it_res.append(copy.deepcopy(respcondition)) + mc_index += 1 + + match matching.grading_type: + case 0: + it_respcondition = ET.SubElement(it_res, "respcondition") + it_respcondition_var = ET.SubElement(it_respcondition, "conditionvar") + ET.SubElement(it_respcondition_var, "other") + it_resp_setvar = ET.SubElement(it_respcondition, "setvar", {"varname": "que_score", "action": "Set"}) + it_resp_setvar.text = "D2L_Correct" + case 1: + it_respcondition = ET.SubElement(it_res, "respcondition") + it_respcondition_var = ET.SubElement(it_respcondition, "conditionvar") + it_respcondition_var_vargte = ET.SubElement(it_respcondition_var, "vargte", {"respident": "D2L_Incorrect"}) + it_respcondition_var_vargte.text = "0" + it_resp_setvar = ET.SubElement(it_respcondition, "setvar", {"varname": "que_score", "action": "Set"}) + it_resp_setvar.text = "0" + + it_respcondition2 = 
copy.deepcopy(it_respcondition) + it_resp_setvar2 = it_respcondition2.find("setvar") + it_resp_setvar2.text = "1" + it_res.append(it_respcondition2) + case 2: + it_respcondition = ET.SubElement(it_res, "respcondition") + it_respcondition_var = ET.SubElement(it_respcondition, "conditionvar") + it_respcondition_var_vargte = ET.SubElement(it_respcondition_var, "vargte", {"respident": "D2L_Incorrect"}) + it_respcondition_var_vargte.text = "D2L_Correct" + it_resp_setvar = ET.SubElement(it_respcondition, "setvar", {"varname": "que_score", "action": "Set"}) + it_resp_setvar.text = "0" + + it_respcondition2 = ET.SubElement(it_res, "respcondition") + it_respcondition_var2 = ET.SubElement(it_respcondition2, "conditionvar") + ET.SubElement(it_respcondition_var2, "varlt", {"respident": "D2L_Incorrect"}) + it_resp_setvar2 = ET.SubElement(it_respcondition2, "setvar", {"varname": "que_score", "action": "Set"}) + it_resp_setvar2.text = "D2L_Correct" + it_resp_setvar3 = ET.SubElement(it_respcondition2, "setvar", {"varname": "que_score", "action": "Subtract"}) + it_resp_setvar3.text = "D2L_Incorrect" + + if question.feedback: + self.generate_feedback(it, question_ident, question.feedback) diff --git a/api/formats/scorm/xml_builders/multi_select.py b/api/formats/scorm/xml_builders/multi_select.py new file mode 100644 index 0000000..2469359 --- /dev/null +++ b/api/formats/scorm/xml_builders/multi_select.py @@ -0,0 +1,77 @@ +import xml.etree.cElementTree as ET + +from ..xmlcdata import CDATA + + +class MultiSelectBuilder: + def generate_multi_select(self, it, question_ident, question): + self.itemetadata(it, "Multi-Select", question) + self.itemproc_extension(it) + + question_lid = question_ident + "_LID" + question_ident_answer = question_ident + "_A" + question_ident_feedback = question_ident + "_IF" + + it_pre = ET.SubElement(it, "presentation") + it_pre_flow = ET.SubElement(it_pre, "flow") + it_pre_flow_mat = ET.SubElement(it_pre_flow, "material") + + multiple_select = 
question.get_multiple_select() + it_pre_flow_mat_text = ET.SubElement(it_pre_flow_mat, "mattext", {"texttype": "text/html"}) + question_text = question.text + it_pre_flow_mat_text.append(CDATA(question_text)) + + it_pre_flow_res = ET.SubElement(it_pre_flow, "response_extension") + it_pre_flow_res_display_style = ET.SubElement(it_pre_flow_res, "d2l_2p0:display_style") + it_pre_flow_res_display_style.text = "2" + it_pre_flow_res_enumeration = ET.SubElement(it_pre_flow_res, "d2l_2p0:enumeration") + it_pre_flow_res_enumeration.text = str(multiple_select.enumeration) if multiple_select.enumeration else "4" + it_pre_flow_res_grading_type = ET.SubElement(it_pre_flow_res, "d2l_2p0:grading_type") + it_pre_flow_res_grading_type.text = "2" + + it_pre_flow_lid = ET.SubElement(it_pre_flow, "response_lid", {"ident": question_lid, "rcardinality": "Multiple"}) + it_pre_flow_lid_render_choice = ET.SubElement( + it_pre_flow_lid, "render_choice", {"shuffle": ("yes" if multiple_select.randomize else "no")} + ) + + if question.hint: + self.generate_hint(it, question.hint) + + it_res = ET.SubElement(it, "resprocessing") + it_out = ET.SubElement(it_res, "outcomes") + ET.SubElement( + it_out, + "decvar", + {"vartype": "Integer", "defaultval": "0", "varname": "que_score", "minvalue": "0", "maxvalue": "100"}, + ) + ET.SubElement(it_out, "decvar", {"vartype": "Integer", "defaultval": "0", "varname": "D2L_Correct", "minvalue": "0"}) + ET.SubElement(it_out, "decvar", {"vartype": "Integer", "defaultval": "0", "varname": "D2L_Incorrect", "minvalue": "0"}) + + if question.feedback: + self.generate_feedback(it, question_ident, question.feedback) + + ms_index = 1 + for ms_answer in multiple_select.get_multiple_select_answers(): + flow = ET.SubElement(it_pre_flow_lid_render_choice, "flow_label", {"class": "Block"}) + response_label = ET.SubElement(flow, "response_label", {"ident": question_ident_answer + str(ms_index)}) + flow_mat = ET.SubElement(response_label, "flow_mat") + material = 
ET.SubElement(flow_mat, "material") + mattext = ET.SubElement(material, "mattext", {"texttype": "text/html"}) + mattext.text = ms_answer.answer + + it_res_con = ET.SubElement(it_res, "respcondition", {"title": "Response Condition", "continue": "yes"}) + it_res_con_var = ET.SubElement(it_res_con, "conditionvar") + it_res_con_var_equal = ET.SubElement(it_res_con_var, "varequal", {"respident": question_lid}) + it_res_con_var_equal.text = question_ident_answer + str(ms_index) + if ms_answer.is_correct is True: + ET.SubElement(it_res_con, "setvar", {"varname": "D2L_Correct", "action": "Add"}) + else: + ET.SubElement(it_res_con, "setvar", {"varname": "D2L_Incorrect", "action": "Add"}) + + if ms_answer.answer_feedback: + self.generate_feedback(it, question_ident_feedback + str(ms_index), ms_answer.answer_feedback) + ms_index += 1 + + it_res_con = ET.SubElement(it_res, "respcondition") + it_res_set_var = ET.SubElement(it_res_con, "setvar", {"varname": "que_score", "action": "Set"}) + it_res_set_var.text = "D2L_Correct" diff --git a/api/formats/scorm/xml_builders/multiple_choice.py b/api/formats/scorm/xml_builders/multiple_choice.py new file mode 100644 index 0000000..5a4c95c --- /dev/null +++ b/api/formats/scorm/xml_builders/multiple_choice.py @@ -0,0 +1,66 @@ +import xml.etree.cElementTree as ET + +from ..xmlcdata import CDATA + + +class MultipleChoiceBuilder: + def generate_multiple_choice(self, it, question_ident, question): + self.itemetadata(it, "Multiple Choice", question) + self.itemproc_extension(it) + question_lid = question_ident + "_LID" + question_ident_answer = question_ident + "_A" + question_ident_feedback = question_ident + "_IF" + + it_pre = ET.SubElement(it, "presentation") + it_pre_flow = ET.SubElement(it_pre, "flow") + it_pre_flow_mat = ET.SubElement(it_pre_flow, "material") + + multiple_choice = question.get_multiple_choice() + it_pre_flow_mat_text = ET.SubElement(it_pre_flow_mat, "mattext", {"texttype": "text/html"}) + question_text = question.text + 
it_pre_flow_mat_text.append(CDATA(question_text)) + + it_pre_flow_res = ET.SubElement(it_pre_flow, "response_extension") + it_pre_flow_res_display_style = ET.SubElement(it_pre_flow_res, "d2l_2p0:display_style") + it_pre_flow_res_display_style.text = "2" + it_pre_flow_res_enumeration = ET.SubElement(it_pre_flow_res, "d2l_2p0:enumeration") + it_pre_flow_res_enumeration.text = str(multiple_choice.enumeration) if multiple_choice.enumeration else "4" + it_pre_flow_res_grading_type = ET.SubElement(it_pre_flow_res, "d2l_2p0:grading_type") + it_pre_flow_res_grading_type.text = "0" + it_pre_flow_lid = ET.SubElement(it_pre_flow, "response_lid", {"ident": question_lid, "rcardinality": "Multiple"}) + it_pre_flow_lid_render_choice = ET.SubElement( + it_pre_flow_lid, "render_choice", {"shuffle": ("yes" if multiple_choice.randomize else "no")} + ) + + if question.hint: + self.generate_hint(it, question.hint) + + it_res = ET.SubElement(it, "resprocessing") + + if question.feedback: + self.generate_feedback(it, question_ident, question.feedback) + + mc_answer_index = 1 + for mc_answer in multiple_choice.get_multiple_choice_answers(): + flow = ET.SubElement(it_pre_flow_lid_render_choice, "flow_label", {"class": "Block"}) + response_label = ET.SubElement(flow, "response_label", {"ident": question_ident_answer + str(mc_answer_index)}) + flow_mat = ET.SubElement(response_label, "flow_mat") + material = ET.SubElement(flow_mat, "material") + mattext = ET.SubElement(material, "mattext", {"texttype": "text/html"}) + mattext.append(CDATA(mc_answer.answer)) + + it_res_con = ET.SubElement(it_res, "respcondition", {"title": "Response Condition" + str(mc_answer_index)}) + it_res_con_var = ET.SubElement(it_res_con, "conditionvar") + it_res_con_var_equal = ET.SubElement(it_res_con_var, "varequal", {"respident": question_lid}) + it_res_con_var_equal.text = question_ident_answer + str(mc_answer_index) + it_res_set_var = ET.SubElement(it_res_con, "setvar", {"action": "Set"}) + it_res_set_var.text = 
str(mc_answer.weight) if mc_answer.weight else "0.0000" + ET.SubElement( + it_res_con, + "displayfeedback", + {"feedbacktype": "Response", "linkrefid": question_ident_feedback + str(mc_answer_index)}, + ) + + if mc_answer.answer_feedback: + self.generate_feedback(it, question_ident_feedback + str(mc_answer_index), mc_answer.answer_feedback) + mc_answer_index += 1 diff --git a/api/formats/scorm/xml_builders/ordering.py b/api/formats/scorm/xml_builders/ordering.py new file mode 100644 index 0000000..73dfc40 --- /dev/null +++ b/api/formats/scorm/xml_builders/ordering.py @@ -0,0 +1,86 @@ +import xml.etree.cElementTree as ET + +from ..xmlcdata import CDATA + + +class OrderingBuilder: + def generate_ordering(self, it, question_ident, question): + self.itemetadata(it, "Ordering", question) + self.itemproc_extension(it) + + question_o = question_ident + "_O" + question_ident_feedback = question_ident + "_IF" + + it_pre = ET.SubElement(it, "presentation") + it_pre_flow = ET.SubElement(it_pre, "flow") + + it_pre_flow_mat = ET.SubElement(it_pre_flow, "material") + it_pre_flow_mat_text = ET.SubElement(it_pre_flow_mat, "mattext", {"texttype": "text/html"}) + question_text = question.text + it_pre_flow_mat_text.append(CDATA(question_text)) + + it_pre_flow_res_ext = ET.SubElement(it_pre_flow, "response_extension") + it_pre_flow_res_ext_grading = ET.SubElement(it_pre_flow_res_ext, "d2l_2p0:grading_type") + grading_type = 2 + it_pre_flow_res_ext_grading.append(CDATA(grading_type)) + + it_pre_flow_res_grp = ET.SubElement(it_pre_flow, "response_grp", {"ident": question_o, "rcardinality": "Ordered"}) + it_pre_flow_res_grp_render = ET.SubElement(it_pre_flow_res_grp, "render_choice", {"shuffle": "yes"}) + it_pre_flow_res_grp_render_flow = ET.SubElement(it_pre_flow_res_grp_render, "flow_label", {"class": "Block"}) + + if question.hint: + self.generate_hint(it, question.hint) + + it_res = ET.SubElement(it, "resprocessing") + it_out = ET.SubElement(it_res, "outcomes") + 
ET.SubElement(it_out, "decvar", {"maxvalue": "100", "minvalue": "0", "varname": "D2L_Correct", "defaultval": "0", "vartype": "Integer"}) + ET.SubElement(it_out, "decvar", {"minvalue": "0", "varname": "D2L_Incorrect", "defaultval": "0", "vartype": "Integer"}) + ET.SubElement(it_out, "decvar", {"minvalue": "0", "varname": "que_score", "defaultval": "0", "vartype": "Integer"}) + + it_res_con_other = ET.SubElement(it_res, "respcondition") + it_res_con_other_var = ET.SubElement(it_res_con_other, "conditionvar") + ET.SubElement(it_res_con_other_var, "other") + it_res_con_other_setvar = ET.SubElement(it_res_con_other, "setvar", {"varname": "que_score", "action": "Set"}) + it_res_con_other_setvar.text = "D2L_Correct" + + if question.feedback: + self.generate_feedback(it, question_ident, question.feedback) + + ord_index = 1 + for ord in question.get_orderings(): + ident_num = question_o + str(ord_index) + it_pre_flow_res_grp_render_flow_res = ET.SubElement( + it_pre_flow_res_grp_render_flow, "response_label", {"ident": ident_num} + ) + it_pre_flow_res_grp_render_flow_res_flow = ET.SubElement(it_pre_flow_res_grp_render_flow_res, "flow_mat") + it_pre_flow_res_grp_render_flow_res_flow_mat = ET.SubElement( + it_pre_flow_res_grp_render_flow_res_flow, "material" + ) + it_pre_flow_res_grp_render_flow_res_flow_mat_text = ET.SubElement( + it_pre_flow_res_grp_render_flow_res_flow_mat, "mattext", {"texttype": "text/html"} + ) + question_text = ord.text + it_pre_flow_res_grp_render_flow_res_flow_mat_text.append(CDATA(question_text)) + + it_res_con_correct = ET.SubElement(it_res, "respcondition", {"title": "Correct Condition"}) + it_res_con_correct_var = ET.SubElement(it_res_con_correct, "conditionvar") + it_res_con_correct_var_equal = ET.SubElement(it_res_con_correct_var, "varequal", {"respident": ident_num}) + it_res_con_correct_var_equal.text = str(ord_index) + it_res_con_correct_setvar = ET.SubElement(it_res_con_correct, "setvar", {"varname": "D2L_Correct", "action": "Add"}) + 
it_res_con_correct_setvar.text = str(1) + + it_res_con_incorrect = ET.SubElement(it_res, "respcondition", {"title": "Incorrect Condition"}) + it_res_con_incorrect_var = ET.SubElement(it_res_con_incorrect, "conditionvar") + it_res_con_incorrect_var_not = ET.SubElement(it_res_con_incorrect_var, "not") + it_res_con_incorrect_var_not_equal = ET.SubElement( + it_res_con_incorrect_var_not, "varequal", {"respident": ident_num} + ) + it_res_con_incorrect_var_not_equal.text = str(ord_index) + it_res_con_incorrect_setvar = ET.SubElement( + it_res_con_incorrect, "setvar", {"varname": "D2L_Incorrect", "action": "Add"} + ) + it_res_con_incorrect_setvar.text = str(1) + + if ord.ord_feedback: + self.generate_feedback(it, question_ident_feedback + str(ord_index), ord.ord_feedback) + ord_index += 1 diff --git a/api/formats/scorm/xml_builders/true_false.py b/api/formats/scorm/xml_builders/true_false.py new file mode 100644 index 0000000..3f9be58 --- /dev/null +++ b/api/formats/scorm/xml_builders/true_false.py @@ -0,0 +1,72 @@ +import xml.etree.cElementTree as ET + +from ..xmlcdata import CDATA + + +class TrueFalseBuilder: + def generate_true_false(self, it, question_ident, question): + self.itemetadata(it, "True/False", question) + self.itemproc_extension(it) + + question_lid = question_ident + "_LID" + question_ident_answer = question_ident + "_A" + question_ident_feedback = question_ident + "_IF" + + it_pre = ET.SubElement(it, "presentation") + it_pre_flow = ET.SubElement(it_pre, "flow") + it_pre_flow_mat = ET.SubElement(it_pre_flow, "material") + + true_false = question.get_true_false() + it_pre_flow_mat_text = ET.SubElement(it_pre_flow_mat, "mattext", {"texttype": "text/html"}) + question_text = question.text + it_pre_flow_mat_text.append(CDATA(question_text)) + + it_pre_flow_res = ET.SubElement(it_pre_flow, "response_extension") + it_pre_flow_res_display_style = ET.SubElement(it_pre_flow_res, "d2l_2p0:display_style") + it_pre_flow_res_display_style.text = "2" + 
it_pre_flow_res_enumeration = ET.SubElement(it_pre_flow_res, "d2l_2p0:enumeration") + it_pre_flow_res_enumeration.text = str(true_false.enumeration) if true_false.enumeration else "4" + it_pre_flow_res_grading_type = ET.SubElement(it_pre_flow_res, "d2l_2p0:grading_type") + it_pre_flow_res_grading_type.text = "0" + + it_pre_flow_lid = ET.SubElement(it_pre_flow, "response_lid", {"ident": question_lid, "rcardinality": "Single"}) + it_pre_flow_lid_render_choice = ET.SubElement(it_pre_flow_lid, "render_choice", {"shuffle": "no"}) + + it_res = ET.SubElement(it, "resprocessing") + + if question.feedback: + self.generate_feedback(it, question_ident, question.feedback) + + tf_index = 0 + answer_text = ["True", "False"] + while tf_index < 2: + flow = ET.SubElement(it_pre_flow_lid_render_choice, "flow_label", {"class": "Block"}) + response_label = ET.SubElement(flow, "response_label", {"ident": question_ident_answer + str(tf_index)}) + flow_mat = ET.SubElement(response_label, "flow_mat") + material = ET.SubElement(flow_mat, "material") + mattext = ET.SubElement(material, "mattext", {"texttype": "text/plain"}) + mattext.text = answer_text[tf_index] + + it_res_con = ET.SubElement(it_res, "respcondition", {"title": "Response Condition" + str(tf_index)}) + it_res_con_var = ET.SubElement(it_res_con, "conditionvar") + it_res_con_var_equal = ET.SubElement(it_res_con_var, "varequal", {"respident": question_lid}) + it_res_con_var_equal.text = question_ident_answer + str(tf_index) + it_res_set_var = ET.SubElement(it_res_con, "setvar", {"action": "Set"}) + + if tf_index == 0: + current_weight = true_false.true_weight + current_feedback = true_false.true_feedback + else: + current_weight = true_false.false_weight + current_feedback = true_false.false_feedback + + it_res_set_var.text = str(current_weight) if current_weight else "0.0000" + ET.SubElement( + it_res_con, + "displayfeedback", + {"feedbacktype": "Response", "linkrefid": question_ident_feedback + str(tf_index)}, + ) + + if 
current_feedback: + self.generate_feedback(it, question_ident_feedback + str(tf_index), current_feedback) + tf_index += 1 diff --git a/api/formats/scorm/xml_builders/written_response.py b/api/formats/scorm/xml_builders/written_response.py new file mode 100644 index 0000000..64dbda6 --- /dev/null +++ b/api/formats/scorm/xml_builders/written_response.py @@ -0,0 +1,56 @@ +import xml.etree.cElementTree as ET + +from ..xmlcdata import CDATA + + +class WrittenResponseBuilder: + def generate_written_response(self, it, question_ident, question): + self.itemetadata(it, "Long Answer", question) + self.itemproc_extension(it) + + question_ident_str = question_ident + "_STR" + question_ident_la = question_ident + "_LA" + + it_pre = ET.SubElement(it, "presentation") + it_pre_flow = ET.SubElement(it_pre, "flow") + + written_response = question.get_written_response() + + it_pre_flow_mat = ET.SubElement(it_pre_flow, "material") + it_pre_flow_mat_text = ET.SubElement(it_pre_flow_mat, "mattext", {"texttype": "text/html"}) + question_text = question.text + it_pre_flow_mat_text.append(CDATA(question_text)) + + it_pre_flow_mat_res_ext = ET.SubElement(it_pre_flow, "response_extension") + it_pre_flow_mat_res_ext_sign = ET.SubElement(it_pre_flow_mat_res_ext, "d2l_2p0:has_signed_comments") + it_pre_flow_mat_res_ext_sign.append(CDATA("no")) + it_pre_flow_mat_res_ext_editor = ET.SubElement(it_pre_flow_mat_res_ext, "d2l_2p0:has_htmleditor") + it_pre_flow_mat_res_ext_editor.append(CDATA("no")) + + it_pre_flow_mat_res_str = ET.SubElement( + it_pre_flow, "response_str", {"rcardinality": "Multiple", "ident": question_ident_str} + ) + it_pre_flow_mat_res_str_render = ET.SubElement( + it_pre_flow_mat_res_str, "render_fib", {"fibtype": "String", "prompt": "Box", "columns": "100", "rows": "15"} + ) + it_pre_flow_mat_res_str_render_label = ET.SubElement( + it_pre_flow_mat_res_str_render, "response_label", {"ident": question_ident_la} + ) + it_pre_flow_mat_res_str_render_label_mat = 
ET.SubElement(it_pre_flow_mat_res_str_render_label, "material") + ET.SubElement(it_pre_flow_mat_res_str_render_label_mat, "mattext", {"texttype": "text/html"}) + + if question.hint: + self.generate_hint(it, question.hint) + if question.feedback: + self.generate_feedback(it, question_ident, question.feedback) + it_init_text = ET.SubElement(it, "initial_text") + it_init_text_mat = ET.SubElement(it_init_text, "initial_text_material") + it_init_text_mat_flow = ET.SubElement(it_init_text_mat, "flow_mat") + it_init_text_mat_flow_mat = ET.SubElement(it_init_text_mat_flow, "material") + ET.SubElement(it_init_text_mat_flow_mat, "mattext", {"texttype": "text/html"}) + it_ans = ET.SubElement(it, "answer_key") + it_ans_mat = ET.SubElement(it_ans, "answer_key_material") + it_ans_mat_flow = ET.SubElement(it_ans_mat, "flow_mat") + it_ans_mat_flow_mat = ET.SubElement(it_ans_mat_flow, "material") + it_ans_mat_flow_mat_text = ET.SubElement(it_ans_mat_flow_mat, "mattext", {"texttype": "text/html"}) + it_ans_mat_flow_mat_text.append(CDATA(written_response.answer_key)) diff --git a/api/scorm/xmlcdata.py b/api/formats/scorm/xmlcdata.py similarity index 81% rename from api/scorm/xmlcdata.py rename to api/formats/scorm/xmlcdata.py index 2971d53..c75bdfc 100644 --- a/api/scorm/xmlcdata.py +++ b/api/formats/scorm/xmlcdata.py @@ -16,8 +16,7 @@ def CDATA(text=None): def _serialize_xml2(write, elem, encoding, qnames, namespaces, orig=ET._serialize_xml): if elem.tag == '![CDATA[': - write("\n<%s%s]]>\n" % \ - (elem.tag, elem.text.encode(encoding, "xmlcharrefreplace"))) + write("\n<%s%s]]>\n" % (elem.tag, elem.text.encode(encoding, "xmlcharrefreplace"))) return return orig(write, elem, encoding, qnames, namespaces) @@ -36,8 +35,6 @@ def _serialize_xml3(write, elem, qnames, namespaces, return orig(write, elem, qnames, namespaces) if six.PY3: - ET._serialize_xml = \ - ET._serialize['xml'] = _serialize_xml3 + ET._serialize_xml = ET._serialize["xml"] = _serialize_xml3 elif six.PY2: - 
ET._serialize_xml = \ - ET._serialize['xml'] = _serialize_xml2 \ No newline at end of file + ET._serialize_xml = ET._serialize["xml"] = _serialize_xml2 \ No newline at end of file diff --git a/api/models.py b/api/models.py index 8e4ee64..6ac7d31 100644 --- a/api/models.py +++ b/api/models.py @@ -6,8 +6,9 @@ # import pypandoc from datetime import datetime -from .scorm.XmlWriter import XmlWriter -from .scorm.manifest import ManifestEntity, ManifestResourceEntity +from .formats.scorm.scorm_writer import ScormWriter +from .formats.scorm.manifest_builder import build_manifest +from .formats.scorm.manifest import ManifestEntity, ManifestResourceEntity from xml.dom.minidom import parseString import xml.etree.cElementTree as ET @@ -28,6 +29,7 @@ import logging +import traceback newlogger = logging.getLogger(__name__) from .logging.logging_adapter import FilenameLoggingAdapter @@ -110,11 +112,11 @@ def create_xml_files(self): logger = FilenameLoggingAdapter(newlogger, {'filename': str(self.id)}) try: ql_obj = QuestionLibrary.objects.filter(id=self.id).first() - parsed_xml = XmlWriter(ql_obj) + parsed_xml = ScormWriter(ql_obj) manifest_entity = ManifestEntity() manifest_resource_entity = ManifestResourceEntity('res_question_library', 'webcontent', 'd2lquestionlibrary', 'questiondb.xml', 'Question Library') manifest_entity.add_resource(manifest_resource_entity) - manifest = parsed_xml.create_manifest(manifest_entity, self.folder_path) + manifest = build_manifest(manifest_entity) parsed_imsmanifest = ET.tostring(manifest.getroot(), encoding='utf-8', xml_declaration=True).decode() parsed_imsmanifest = parseString(parsed_imsmanifest) parsed_imsmanifest = parsed_imsmanifest.toprettyxml(indent="\t") @@ -122,18 +124,28 @@ def create_xml_files(self): self.save() logger.info("imsmanifest String Created") except Exception as e: - logger.error("imsmanifest String Failed") - self.error = "imsmanifest String Failed" + logger.error(f"imsmanifest String Failed: {e}") + self.error = 
f"imsmanifest String Failed: {e}\n{traceback.format_exc()}" self.save() + return try: + if "parsed_xml" not in locals(): + raise RuntimeError("ScormWriter failed; questiondb_string not generated.") questiondb_string = parsed_xml.questiondb_string media_folder = self.media_folder if self.media_folder != None else f'./assessment-assets/{self.filtered_main_title}/' img_elements = re.findall(r"\", questiondb_string, re.MULTILINE) for idx, img in enumerate(img_elements): img_src = re.findall(r"src=\"(.*?)\"", img, re.MULTILINE) + if not img_src: + continue + if ";base64," not in img_src[0]: + # Skip non-base64 images (external paths or placeholders) + continue base64_img = img_src[0].split(';base64,') + if len(base64_img) < 2: + continue img_string = base64_img[1] img_ext = base64_img[0].split("/")[1] image_data = base64.b64decode(img_string) @@ -162,10 +174,17 @@ def create_xml_files(self): logger.info("QuestionDB String Created") except Exception as e: - logger.error("QuestionDB String Failed") - - self.error = "QuestionDB String Failed" + logger.error(f"QuestionDB String Failed: {e}") + self.error = f"QuestionDB String Failed: {e}\n{traceback.format_exc()}" self.save() + return + + if not self.questiondb_string: + if not self.error: + self.error = "XML files Failed: questiondb_string is empty or missing." 
+ self.save() + logger.error("XML files Failed: questiondb_string is empty or missing.") + return try: questiondb_file = ContentFile(self.questiondb_string, name="questiondb.xml") @@ -176,8 +195,8 @@ def create_xml_files(self): # print(datetime.now().strftime("%H:%M:%S"), "imsmanifest.xml and questiondb.xml created!") except Exception as e: - logger.error("XML files Failed") - self.error = "XML files Failed" + logger.error(f"XML files Failed: {e}") + self.error = f"XML files Failed: {e}\n{traceback.format_exc()}" self.save() def zip_files(self): @@ -197,9 +216,8 @@ def zip_files(self): logger.info("ZIP file Created") except Exception as e: - logger.error("ZIP file Failed") - - self.error = "ZIP file Failed" + logger.error(f"ZIP file Failed: {e}") + self.error = f"ZIP file Failed: {e}" self.save() def create_zip_file_package(self): @@ -213,8 +231,8 @@ def create_zip_file_package(self): self.save() logger.info("ZIP file with JSON package Created") except Exception as e: - logger.error("ZIP file with JSON package Failed") - self.error = "ZIP file Failed" + logger.error(f"ZIP file with JSON package Failed: {e}") + self.error = f"ZIP file Failed: {e}" self.save() def cleanup(self): diff --git a/api/pipelines/__init__.py b/api/pipelines/__init__.py new file mode 100644 index 0000000..865de7c --- /dev/null +++ b/api/pipelines/__init__.py @@ -0,0 +1 @@ +# Pipeline orchestration layer. 
diff --git a/api/pipelines/docx_to_json.py b/api/pipelines/docx_to_json.py new file mode 100644 index 0000000..ee80b9e --- /dev/null +++ b/api/pipelines/docx_to_json.py @@ -0,0 +1,36 @@ +import logging +from api.pipelines.ws_pipeline import Process, run_pipeline +from api.pipelines.response_payload import build_response_payload + +logger = logging.getLogger(__name__) + + +class DocxToJsonError(Exception): + def __init__(self, message, process=None): + super().__init__(message) + self.process = process + + +def build_docx_to_json(questionlibrary): + """ + Run the DOCX pipeline and return the QuestionLibrary instance. + """ + pipeline = Process(questionlibrary) + try: + run_pipeline(pipeline) + except Exception as exc: + raise DocxToJsonError(str(exc), process=pipeline) + return pipeline.questionlibrary + + +def docx_to_json(questionlibrary, logger_instance=None): + """ + High-level function to convert DOCX to JSON. + Returns the JSON payload and QuestionLibrary instance. + """ + log = logger_instance or logger + log.info(f"[{questionlibrary.id}] DOCX to JSON conversion started") + ql_instance = build_docx_to_json(questionlibrary) + json_data = build_response_payload(ql_instance) + log.info(f"[{ql_instance.id}] DOCX to JSON conversion completed") + return json_data, ql_instance diff --git a/api/pipelines/json_to_docx.py b/api/pipelines/json_to_docx.py new file mode 100644 index 0000000..0a69311 --- /dev/null +++ b/api/pipelines/json_to_docx.py @@ -0,0 +1,239 @@ +import base64 +import glob +import os +import re +import uuid +import subprocess +import logging +from os import path + +from django.conf import settings +from django.core.files import File +from django.http import FileResponse + +from api.serializers import QuestionLibraryPackageSerializer +from api.formats.scorm.scorm_formatter import ScormFormatter + +logger = logging.getLogger(__name__) + +class JsonToDocxError(Exception): + def __init__(self, errors): + super().__init__("JSON to DOCX validation failed") 
+ self.errors = errors + + +def build_docx_from_json(json_data, logger_instance=None): + """ + High-level function to convert JSON to DOCX file. + Returns a FileResponse and QuestionLibrary instance. + """ + log = logger_instance or logger + + payload = json_data.get("data", json_data) + ql_serializer = QuestionLibraryPackageSerializer(data=payload) + if not ql_serializer.is_valid(): + raise JsonToDocxError(ql_serializer.errors) + + ql_instance = ql_serializer.save() + ql_instance.filter_main_title() + ql_instance.folder_path = settings.MEDIA_ROOT + str(ql_instance.id) + ql_instance.image_path = ql_instance.folder_path + settings.MEDIA_URL + ql_instance.create_directory() + ql_instance.save() + + formatter = ScormFormatter() + markdown_text = formatter.format_to_markdown(ql_instance) + + image_counter = 0 + base64_pattern = r']*?)src=["\'](data:image/([^;]+);base64,([^"\']+))["\']([^>]*?)>' + + def replace_base64_with_file(match): + nonlocal image_counter + before_src = match.group(1) + image_type = match.group(3) + base64_data = match.group(4) + after_src = match.group(5) + + try: + image_data = base64.b64decode(base64_data) + ext_map = { + "png": "png", + "jpeg": "jpg", + "jpg": "jpg", + "gif": "gif", + "svg+xml": "svg", + "webp": "webp", + } + ext = ext_map.get(image_type.lower(), "png") + image_filename = f"image_{image_counter}_{uuid.uuid4().hex[:8]}.{ext}" + image_path = path.join(ql_instance.folder_path, image_filename) + + with open(image_path, "wb") as img_file: + img_file.write(image_data) + + image_counter += 1 + log.info( + f"Extracted base64 image to file: {image_filename} ({len(image_data)} bytes)" + ) + + alt_match = re.search(r'alt=["\']([^"\']*)["\']', before_src + after_src) + alt_text = alt_match.group(1) if alt_match else "image" + markdown_image = f"![{alt_text}]({image_filename})" + log.debug(f"Replacing base64 img tag with markdown: {markdown_image}") + return markdown_image + except Exception as e: + log.error(f"Error extracting base64 
image: {str(e)}") + return match.group(0) + + markdown_text = re.sub(base64_pattern, replace_base64_with_file, markdown_text) + log.info(f"Extracted {image_counter} base64 images to files") + + if ql_instance.main_title: + filename = ql_instance.main_title.strip() + filename = re.sub(r'[<>:"/\\|?*]', "", filename) + filename = re.sub(r"\s+", "_", filename) + filename = filename[:100] + if not filename: + filename = ql_instance.filtered_main_title + else: + filename = ql_instance.filtered_main_title + + docx_filename = f"{filename}.docx" + docx_path = path.join(ql_instance.folder_path, docx_filename) + + current_file_dir = os.path.dirname(os.path.abspath(__file__)) + base_dir = os.path.dirname(os.path.dirname(current_file_dir)) + mdblockquote_path = os.path.abspath( + os.path.join(base_dir, "pandoc", "pandoc-filters", "mdblockquote.lua") + ) + emptypara_path = os.path.abspath( + os.path.join(base_dir, "pandoc", "pandoc-filters", "emptypara.lua") + ) + log.debug( + f"Lua filter paths: mdblockquote={mdblockquote_path}, emptypara={emptypara_path}" + ) + + temp_md_path = path.join(ql_instance.folder_path, "temp_markdown.md") + with open(temp_md_path, "w", encoding="utf-8") as f: + f.write(markdown_text) + + file_refs = re.findall(r'!\[.*?\]\((image_\d+_[^)]+)\)', markdown_text) + log.info(f"Found {len(file_refs)} image file references in markdown") + image_files = glob.glob(path.join(ql_instance.folder_path, "image_*.*")) + image_info = [] + total_image_size = 0 + for img_file in image_files: + if path.exists(img_file): + img_size = path.getsize(img_file) + total_image_size += img_size + img_size_mb = img_size / (1024 * 1024) + image_info.append( + f"{path.basename(img_file)} ({img_size_mb:.2f} MB, {img_size} bytes)" + ) + if len(image_files) > 0: + log.info(f"Found {len(image_files)} image files in folder:") + for info in image_info: + log.info(f" - {info}") + log.info( + f"Total image size: {total_image_size / (1024 * 1024):.2f} MB ({total_image_size} bytes)" + ) + 
log.info(f"Markdown file created at: {temp_md_path}") + + original_cwd = os.getcwd() + try: + os.chdir(ql_instance.folder_path) + temp_md_rel_path = "temp_markdown.md" + docx_output_name = os.path.basename(docx_path) + log.info( + f"Converting markdown with image file references to DOCX (working dir: {os.getcwd()})" + ) + existing_images = glob.glob("image_*.*") + log.info(f"Images in working directory before Pandoc: {existing_images}") + with open(temp_md_rel_path, "r", encoding="utf-8") as f: + md_content = f.read() + image_refs_in_md = re.findall(r'!\[.*?\]\((image_\d+_[^)]+)\)', md_content) + log.info(f"Image references found in markdown file: {image_refs_in_md}") + pandoc_cmd = [ + "pandoc", + temp_md_rel_path, + "-f", + "markdown_github+fancy_lists+emoji+hard_line_breaks+all_symbols_escapable+escaped_line_breaks+pipe_tables+startnum+tex_math_dollars", + "-t", + "docx+empty_paragraphs", + "-o", + docx_output_name, + "--no-highlight", + "--preserve-tabs", + "--wrap=preserve", + "--indent=false", + "--mathml", + "--ascii", + "--lua-filter=" + mdblockquote_path, + "--lua-filter=" + emptypara_path, + ] + log.info(f"Running pandoc command: {' '.join(pandoc_cmd)}") + result = subprocess.run( + pandoc_cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + if result.returncode != 0: + log.error(f"Pandoc failed (exit {result.returncode}): {result.stderr}") + raise Exception(f"Pandoc failed: {result.stderr}") + if result.stderr: + log.warning(f"Pandoc warnings: {result.stderr}") + log.info("Pandoc markdown to DOCX conversion completed") + finally: + os.chdir(original_cwd) + + try: + if path.exists(temp_md_path): + from os import remove + + remove(temp_md_path) + + image_files = ( + glob.glob(path.join(ql_instance.folder_path, "image_*.png")) + + glob.glob(path.join(ql_instance.folder_path, "image_*.jpg")) + + glob.glob(path.join(ql_instance.folder_path, "image_*.jpeg")) + + glob.glob(path.join(ql_instance.folder_path, "image_*.gif")) + + 
glob.glob(path.join(ql_instance.folder_path, "image_*.svg")) + + glob.glob(path.join(ql_instance.folder_path, "image_*.webp")) + ) + for img_file in image_files: + try: + if path.exists(img_file): + os.remove(img_file) + except Exception as e: + log.warning( + f"Could not remove temporary image file {img_file}: {str(e)}" + ) + except Exception: + pass + + with open(docx_path, "rb") as f: + ql_instance.temp_file.save(docx_filename, File(f), save=True) + + file_response = FileResponse(ql_instance.temp_file) + file_response["Content-Disposition"] = f'attachment; filename="{docx_filename}"' + + docx_size_bytes = path.getsize(docx_path) + docx_size_mb = docx_size_bytes / (1024 * 1024) + log.info( + f"[{ql_instance.id}] JSON to DOCX conversion completed - DOCX size: {docx_size_mb:.2f} MB ({docx_size_bytes} bytes)" + ) + + return file_response, ql_instance + + +def json_to_docx(json_data, logger_instance=None): + """ + High-level function to convert JSON to DOCX file. + Returns a FileResponse and QuestionLibrary instance. + """ + log = logger_instance or logger + log.info("JSON to DOCX conversion started") + file_response, ql_instance = build_docx_from_json(json_data, log) + log.info(f"[{ql_instance.id}] JSON to DOCX conversion completed") + return file_response, ql_instance diff --git a/api/pipelines/json_to_scorm.py b/api/pipelines/json_to_scorm.py new file mode 100644 index 0000000..9cdbf28 --- /dev/null +++ b/api/pipelines/json_to_scorm.py @@ -0,0 +1,66 @@ +from django.conf import settings +from django.http import FileResponse +from api.serializers import QuestionLibraryPackageSerializer +import logging + +logger = logging.getLogger(__name__) + + +class JsonToScormError(Exception): + def __init__(self, errors): + super().__init__("JSON to SCORM validation failed") + self.errors = errors + + +def build_scorm_from_json(json_data): + """ + Build SCORM ZIP from JSON data. + Returns the QuestionLibrary instance with zip_file created. 
+ """ + payload = json_data.get("data", json_data) + ql_serializer = QuestionLibraryPackageSerializer(data=payload) + if not ql_serializer.is_valid(): + raise JsonToScormError(ql_serializer.errors) + + ql_instance = ql_serializer.save() + ql_instance.filter_main_title() + ql_instance.folder_path = settings.MEDIA_ROOT + str(ql_instance.id) + ql_instance.image_path = ql_instance.folder_path + settings.MEDIA_URL + ql_instance.create_directory() + ql_instance.save() + + ql_instance.create_xml_files() + missing_files = [] + if not ql_instance.imsmanifest_file: + missing_files.append("imsmanifest_file") + if not ql_instance.questiondb_file: + missing_files.append("questiondb_file") + if missing_files: + detail = ql_instance.error or "XML generation failed." + raise JsonToScormError({"xml_files": [detail], "missing_files": missing_files}) + + ql_instance.zip_files() + + if not ql_instance.zip_file: + detail = ql_instance.error or "Zip file was not created." + raise JsonToScormError({"zip_file": [detail]}) + + return ql_instance + + +def json_to_scorm(json_data, logger_instance=None): + """ + High-level function to convert JSON to SCORM ZIP file. + Returns a FileResponse and QuestionLibrary instance. 
+ """ + log = logger_instance or logger + log.info("JSON to SCORM conversion started") + ql_instance = build_scorm_from_json(json_data) + + file_name = f"{ql_instance.filtered_main_title}.zip" + file_response = FileResponse(ql_instance.zip_file) + file_response['Content-Disposition'] = f'attachment; filename="{file_name}"' + + log.info(f"[{ql_instance.id}] JSON to SCORM conversion completed") + + return file_response, ql_instance diff --git a/api/pipelines/response_payload.py b/api/pipelines/response_payload.py new file mode 100644 index 0000000..014cf9e --- /dev/null +++ b/api/pipelines/response_payload.py @@ -0,0 +1,141 @@ +import copy +import re +import socket + +from django.conf import settings + +from api.serializers import JsonResponseSerializer, count_errors +from api.formats.docx.process_helper import html_to_plain, trim_text + + +def build_response_payload(questionlibrary, preview=False): + count_errors(questionlibrary) + serializer = JsonResponseSerializer(questionlibrary) + json_data = serializer.data + json_data["total_question_errors"] = str(questionlibrary.total_question_errors or 0) + json_data["total_document_errors"] = str(questionlibrary.total_document_errors or 0) + + questionlibrary.json_data = json_data + questionlibrary.save(update_fields=["json_data"]) + + if preview: + return _apply_preview_transform(copy.deepcopy(json_data), questionlibrary) + + return json_data + + +def build_status_payload(status, statustext, data="", process=None, questionlibrary=None): + if process: + payload = process.sendformat(status, statustext, data) + else: + payload = { + "hostname": socket.gethostname(), + "version": settings.APP_VERSION, + "status": status, + "statustext": statustext, + "images_count": "0", + "section_count": "0", + "questions_count": "0", + "endanswer_count": "0", + "question_info_count": "0", + "question_warning_count": "0", + "question_error_count": "0", + "data": data, + } + + if questionlibrary: + total_question_errors = 
getattr(questionlibrary, "total_question_errors", 0) or 0 + total_document_errors = getattr(questionlibrary, "total_document_errors", 0) or 0 + payload["total_question_errors"] = str(total_question_errors) + payload["total_document_errors"] = str(total_document_errors) + + return payload + + +def _apply_preview_transform(json_data, questionlibrary): + def replace_placeholders(text): + if not text: + return text + + pattern = r"<<<<(\d+)>>>>" + + def replace_match(match): + image_id = match.group(1) + try: + image = questionlibrary.get_image(int(image_id)) + return image.image or match.group(0) + except Exception: + return match.group(0) + + return re.sub(pattern, replace_match, text) + + def build_title_from_text(text): + if not text: + return None + + has_table = re.search(r"", text) + has_img = re.search(r"]+>", text) + + title_text = text.replace("\n", " ") + title_text = re.sub(r"", "[IMG]", title_text) + title_text = re.sub(r"", "[TABLE]", title_text) + title_text = re.sub(r"<<<<\d+>>>>", "[IMG]", title_text) + + title_text = html_to_plain(title_text) + title_text = trim_text(title_text) + + prefix = "" + if has_table: + prefix = "[TABLE]" + prefix + if has_img: + prefix = "[IMG]" + prefix + + if prefix: + prefix = prefix + " " + title_text = re.sub(r"\s*\[IMG\]", "", title_text).strip() + title_text = re.sub(r"\s*\[TABLE\]", "", title_text).strip() + + title_text = prefix + title_text + return title_text[:127] + + for section in json_data.get("sections", []): + section["text"] = replace_placeholders(section.get("text")) + + for question in section.get("questions", []): + question["text"] = replace_placeholders(question.get("text")) + + if not question.get("title"): + question["title"] = build_title_from_text(question.get("text")) + + for mc in question.get("multiple_choice") or []: + for answer in mc.get("multiple_choice_answers") or []: + answer["answer"] = replace_placeholders(answer.get("answer")) + answer["answer_feedback"] = 
replace_placeholders(answer.get("answer_feedback")) + + for tf in question.get("true_false") or []: + tf["true_feedback"] = replace_placeholders(tf.get("true_feedback")) + tf["false_feedback"] = replace_placeholders(tf.get("false_feedback")) + + for fib in question.get("fib") or []: + fib["text"] = replace_placeholders(fib.get("text")) + + for ms in question.get("multiple_select") or []: + for answer in ms.get("multiple_select_answers") or []: + answer["answer"] = replace_placeholders(answer.get("answer")) + answer["answer_feedback"] = replace_placeholders(answer.get("answer_feedback")) + + for ordering in question.get("ordering") or []: + ordering["text"] = replace_placeholders(ordering.get("text")) + ordering["ord_feedback"] = replace_placeholders(ordering.get("ord_feedback")) + + for matching in question.get("matching") or []: + for choice in matching.get("matching_choices") or []: + choice["choice_text"] = replace_placeholders(choice.get("choice_text")) + for answer in choice.get("matching_answers") or []: + answer["answer_text"] = replace_placeholders(answer.get("answer_text")) + + for wr in question.get("written_response") or []: + wr["initial_text"] = replace_placeholders(wr.get("initial_text")) + wr["answer_key"] = replace_placeholders(wr.get("answer_key")) + + return json_data diff --git a/api/pipelines/scorm_to_json.py b/api/pipelines/scorm_to_json.py new file mode 100644 index 0000000..875bc58 --- /dev/null +++ b/api/pipelines/scorm_to_json.py @@ -0,0 +1,50 @@ +from os import path +import logging + +from api.serializers import QuestionLibraryPackageSerializer, count_errors +from api.formats.scorm.scorm_extractor import ScormExtractor + +logger = logging.getLogger(__name__) + + +class ScormToJsonError(Exception): + def __init__(self, message): + super().__init__(message) + + +def build_scorm_to_json(instance): + """ + Run the SCORM extractor and return JSON data + QuestionLibrary instance. 
+ """ + scorm_zip_path = instance.temp_file.path + xml_reader = ScormExtractor( + scorm_zip_path, + extract_to_path=path.join(instance.folder_path, "scorm_extract"), + ) + + question_library = xml_reader.populate_django_models(instance) + ql_serializer = QuestionLibraryPackageSerializer(question_library) + json_data = ql_serializer.data + + count_errors(question_library) + json_data["total_question_errors"] = str(question_library.total_question_errors or 0) + json_data["total_document_errors"] = str(question_library.total_document_errors or 0) + + instance.json_data = json_data + instance.save() + + return json_data, question_library + + +def scorm_to_json(instance, logger_instance=None): + """ + High-level function to convert SCORM ZIP to JSON. + Returns the JSON data and QuestionLibrary instance. + """ + log = logger_instance or logger + log.info(f"[{instance.id}] SCORM to JSON conversion started") + + json_data, question_library = build_scorm_to_json(instance) + log.info(f"[{instance.id}] SCORM to JSON conversion completed") + + return json_data, question_library diff --git a/api/pipelines/ws_pipeline.py b/api/pipelines/ws_pipeline.py new file mode 100644 index 0000000..828f07c --- /dev/null +++ b/api/pipelines/ws_pipeline.py @@ -0,0 +1,142 @@ +from bs4 import BeautifulSoup +from api.formats.docx.extract_images import extract_images +from api.formats.docx.formatter import run_formatter +from api.formats.docx.sectioner import run_sectioner +from api.formats.docx.splitter import Splitter +from api.formats.docx.endanswers import get_endanswers +from api.formats.docx.parser import run_parser +from api.formats.docx.convert_txt import convert_txt +from api.formats.docx.fix_numbering import fix_numbering +import socket +from api.tasks import run_pandoc_task +from django.conf import settings +import logging +from api.logging.logging_adapter import FilenameLoggingAdapter + +from api.logging.ErrorTypes import * + +logger = logging.getLogger(__name__) + + +class Process: + 
def __init__(self, questionlibrary) -> None: + self.questionlibrary = questionlibrary + self.images_extracted = 0 + self.subsection_count = 0 + self.questions_expected = 0 + self.questions_processed = 0 + self.endanswers_count = 0 + self.question_info_count = 0 + self.question_warning_count = 0 + self.question_error_count = 0 + + def run_pandoc(self): + file_logger = FilenameLoggingAdapter( + logger, + { + "filename": self.questionlibrary.temp_file.name, + "user_ip": self.questionlibrary.user_ip, + }, + ) + try: + result = run_pandoc_task.apply_async( + kwargs={"questionlibrary_id": self.questionlibrary.id}, + ignore_result=False, + ) + pandoc_task_result = result.get() + self.questionlibrary.pandoc_output = pandoc_task_result + except Exception as e: + raise Exception(str(e)) + + if self.questionlibrary.pandoc_output is None: + raise MarkDownConversionError("Pandoc output string is empty") + + def convert_txt(self): + convert_txt(self.questionlibrary) + + def fix_numbering(self): + fix_numbering(self.questionlibrary) + + def extract_images(self): + self.images_extracted = extract_images(self.questionlibrary) + + def run_formatter(self): + file_logger = FilenameLoggingAdapter( + logger, + { + "filename": self.questionlibrary.temp_file.name, + "user_ip": self.questionlibrary.user_ip, + }, + ) + file_logger.debug("starting formatter antlr process") + run_formatter(self.questionlibrary) + + # This is to split sections into separate objects + def run_sectioner(self): + file_logger = FilenameLoggingAdapter( + logger, + { + "filename": self.questionlibrary.temp_file.name, + "user_ip": self.questionlibrary.user_ip, + }, + ) + file_logger.debug("starting sectioner antlr process") + self.subsection_count = run_sectioner(self.questionlibrary) + + def run_splitter(self): + file_logger = FilenameLoggingAdapter( + logger, + { + "filename": self.questionlibrary.temp_file.name, + "user_ip": self.questionlibrary.user_ip, + }, + ) + file_logger.debug("starting splitter antlr 
process") + splitter = Splitter(self.questionlibrary) + self.questions_expected = splitter.run_splitter() + + def get_endanswers(self): + self.endanswers_count = get_endanswers(self.questionlibrary) + + def run_parser(self): + file_logger = FilenameLoggingAdapter( + logger, + { + "filename": self.questionlibrary.temp_file.name, + "user_ip": self.questionlibrary.user_ip, + }, + ) + file_logger.debug("starting questionparser antlr process") + run_parser(self.questionlibrary) + + def sendformat(self, status, statustext, data): + return { + "hostname": socket.gethostname(), + "version": settings.APP_VERSION, + "status": status, + "statustext": statustext, + "images_count": str(self.images_extracted), + "section_count": str(self.subsection_count), + "questions_count": str(self.questions_expected), + "endanswer_count": str(self.endanswers_count), + "question_info_count": str(self.question_info_count), + "question_warning_count": str(self.question_warning_count), + "question_error_count": str(self.question_error_count), + "data": data, + } + + +def run_pipeline(pipeline): + pipeline.run_pandoc() + pipeline.extract_images() + pipeline.run_formatter() + pipeline.run_sectioner() + pipeline.run_splitter() + pipeline.get_endanswers() + pipeline.run_parser() + return pipeline + + +def process(questionlibrary): + pipeline = Process(questionlibrary) + return run_pipeline(pipeline) diff --git a/api/process/process.py b/api/process/process.py deleted file mode 100644 index 0f384fc..0000000 --- a/api/process/process.py +++ /dev/null @@ -1,127 +0,0 @@ -from bs4 import BeautifulSoup -from .extract_images import extract_images -from .formatter import run_formatter -from .sectioner import run_sectioner -from .splitter import Splitter -from .endanswers import get_endanswers -from .parser import run_parser -from .convert_txt import convert_txt -from .fix_numbering import fix_numbering -import socket -from api.tasks import run_pandoc_task -from django.conf import settings -import logging 
-newlogger = logging.getLogger(__name__) -# from api.logging.contextfilter import QuestionlibraryFilenameFilter -# logger.addFilter(QuestionlibraryFilenameFilter()) -from api.logging.logging_adapter import FilenameLoggingAdapter - -from api.logging.ErrorTypes import * -import os - -class Process: - def __init__(self, questionlibrary) -> None: - self.questionlibrary = questionlibrary - self.images_extracted = 0 - self.subsection_count = 0 - self.questions_expected = 0 - self.questions_processed = 0 - self.endanswers_count = 0 - self.question_info_count = 0 - self.question_warning_count = 0 - self.question_error_count = 0 - - def run_pandoc(self): - logger = FilenameLoggingAdapter(newlogger, { - 'filename': self.questionlibrary.temp_file.name, - 'user_ip': self.questionlibrary.user_ip - }) - try: - result = run_pandoc_task.apply_async(kwargs={"questionlibrary_id":self.questionlibrary.id}, ignore_result=False) - pandoc_task_result = result.get() - # logger.debug(pandoc_task_result) - self.questionlibrary.pandoc_output = pandoc_task_result - except Exception as e: - raise Exception(str(e)) - - if self.questionlibrary.pandoc_output == None: - raise MarkDownConversionError("Pandoc output string is empty") - - def convert_txt(self): - convert_txt(self.questionlibrary) - - def fix_numbering(self): - # logger = FilenameLoggingAdapter(newlogger, { - # 'filename': self.questionlibrary.temp_file.name, - # 'user_ip': self.questionlibrary.user_ip - # }) - # logger.debug("starting pandoc html to md") - # try: - # result = convert_html_to_md.apply_async(kwargs={"questionlibrary_id":self.questionlibrary.id}, ignore_result=False) - # convert_html_to_md_task_result = result.get() - # logger.debug("pdf to md result") - # logger.debug(convert_html_to_md_task_result) - # self.questionlibrary.txt_output = convert_html_to_md_task_result - # self.questionlibrary.save() - # except Exception as e: - # raise Exception(str(e)) - - fix_numbering(self.questionlibrary) - - def 
extract_images(self): - self.images_extracted = extract_images(self.questionlibrary) - - def run_formatter(self): - logger = FilenameLoggingAdapter(newlogger, { - 'filename': self.questionlibrary.temp_file.name, - 'user_ip': self.questionlibrary.user_ip - }) - logger.debug("starting formatter antlr process") - run_formatter(self.questionlibrary) - - # This is to split sections into separate objects - def run_sectioner(self): - logger = FilenameLoggingAdapter(newlogger, { - 'filename': self.questionlibrary.temp_file.name, - 'user_ip': self.questionlibrary.user_ip - }) - logger.debug("starting sectioner antlr process") - self.subsection_count = run_sectioner(self.questionlibrary) - - def run_splitter(self): - logger = FilenameLoggingAdapter(newlogger, { - 'filename': self.questionlibrary.temp_file.name, - 'user_ip': self.questionlibrary.user_ip - }) - logger.debug("starting splitter antlr process") - splitter = Splitter(self.questionlibrary) - self.questions_expected = splitter.run_splitter() - - def get_endanswers(self): - self.endanswers_count = get_endanswers(self.questionlibrary) - - def run_parser(self): - logger = FilenameLoggingAdapter(newlogger, { - 'filename': self.questionlibrary.temp_file.name, - 'user_ip': self.questionlibrary.user_ip - }) - logger.debug("starting questionparser antlr process") - run_parser(self.questionlibrary) - - def sendformat(self, status, statustext, data): - - return { - 'hostname': socket.gethostname(), - 'version': settings.APP_VERSION, - 'status': status, - 'statustext': statustext, - 'images_count': str(self.images_extracted), - 'section_count': str(self.subsection_count), - 'questions_count': str(self.questions_expected), - 'endanswer_count': str(self.endanswers_count), - 'question_info_count': str(self.question_info_count), - 'question_warning_count': str(self.question_warning_count), - 'question_error_count': str(self.question_error_count), - 'data': data - } -# 
++++++++++++++++++++++++++++++++=================================== diff --git a/api/questions/__init__.py b/api/questions/__init__.py new file mode 100644 index 0000000..39d70bd --- /dev/null +++ b/api/questions/__init__.py @@ -0,0 +1 @@ +# Question domain logic. diff --git a/api/questions/model_builders/__init__.py b/api/questions/model_builders/__init__.py new file mode 100644 index 0000000..0baf102 --- /dev/null +++ b/api/questions/model_builders/__init__.py @@ -0,0 +1 @@ +# Question builders. diff --git a/api/process/questionbuilder/fib.py b/api/questions/model_builders/fib.py similarity index 98% rename from api/process/questionbuilder/fib.py rename to api/questions/model_builders/fib.py index 8bf9574..a7e4d2d 100644 --- a/api/process/questionbuilder/fib.py +++ b/api/questions/model_builders/fib.py @@ -1,6 +1,6 @@ from ...models import Fib import re -from ..process_helper import markdown_to_plain +from api.formats.docx.process_helper import markdown_to_plain def build_inline_FIB(question): question.questiontype = 'FIB' diff --git a/api/process/questionbuilder/matching.py b/api/questions/model_builders/matching.py similarity index 98% rename from api/process/questionbuilder/matching.py rename to api/questions/model_builders/matching.py index fc1e15c..830a182 100644 --- a/api/process/questionbuilder/matching.py +++ b/api/questions/model_builders/matching.py @@ -1,6 +1,6 @@ import re from ...models import Matching, MatchingChoice, MatchingAnswer -from ..process_helper import add_error_message, trim_text, markdown_to_html +from api.formats.docx.process_helper import add_error_message, trim_text, markdown_to_html from api.logging.ErrorTypes import MATNoMatchError, MATMissingChoiceError, MATMissingAnswerError def build_inline_MAT(question, answers): diff --git a/api/process/questionbuilder/multiplechoice.py b/api/questions/model_builders/multiplechoice.py similarity index 96% rename from api/process/questionbuilder/multiplechoice.py rename to 
api/questions/model_builders/multiplechoice.py index c9a4c21..15e5efd 100644 --- a/api/process/questionbuilder/multiplechoice.py +++ b/api/questions/model_builders/multiplechoice.py @@ -1,6 +1,6 @@ import re from ...models import MultipleChoice, MultipleChoiceAnswer -from ..process_helper import add_warning_message, trim_text, trim_md_to_plain, trim_md_to_html +from api.formats.docx.process_helper import add_warning_message, trim_text, trim_md_to_plain, trim_md_to_html from api.logging.WarningTypes import MCEndAnswerExistWarning from celery.utils.log import get_task_logger from api.logging.logging_adapter import FilenameLoggingAdapter diff --git a/api/process/questionbuilder/multipleselect.py b/api/questions/model_builders/multipleselect.py similarity index 92% rename from api/process/questionbuilder/multipleselect.py rename to api/questions/model_builders/multipleselect.py index 9957185..c7c5dcc 100644 --- a/api/process/questionbuilder/multipleselect.py +++ b/api/questions/model_builders/multipleselect.py @@ -1,6 +1,6 @@ import re from ...models import MultipleSelect, MultipleSelectAnswer -from ..process_helper import add_warning_message, trim_text, trim_md_to_html, trim_md_to_plain +from api.formats.docx.process_helper import add_warning_message, trim_text, trim_md_to_html, trim_md_to_plain from api.logging.WarningTypes import MSEndAnswerExistWarning def build_inline_MS(question, answers, is_random, enumeration): @@ -19,7 +19,7 @@ def build_inline_MS(question, answers, is_random, enumeration): for answer_order, answer_item in enumerate(answers): ms_answerobject = MultipleSelectAnswer.objects.create(multiple_select=ms_object) answer_index = trim_text(answer_item.get('answer_prefix')) - ms_answerobject.index = re.sub(r'[\W_]', '', answer_index) + ms_answerobject.index = re.sub(r'[\W_]', '', answer_index) ms_answerobject.order = answer_order + 1 ms_answerobject.answer = trim_md_to_html(answer_item.get('answer_content')) answer_feedback = answer_item.get('feedback') 
@@ -57,7 +57,7 @@ def build_endanswer_MS(question, answers, endanswer, is_random, enumeration): for idx, answer_item in enumerate(answers): ms_answerobject = MultipleSelectAnswer.objects.create(multiple_select=ms_object) answer_index = trim_text(answer_item.get('answer_prefix')) - ms_answerobject.index = re.sub(r'[\W_]', '', answer_index) + ms_answerobject.index = re.sub(r'[\W_]', '', answer_index) ms_answerobject.order = idx + 1 ms_answerobject.answer = trim_md_to_html(answer_item.get('answer_content')) answer_feedback = answer_item.get('feedback') diff --git a/api/process/questionbuilder/ordering.py b/api/questions/model_builders/ordering.py similarity index 94% rename from api/process/questionbuilder/ordering.py rename to api/questions/model_builders/ordering.py index c6c3f98..2ccb7c5 100644 --- a/api/process/questionbuilder/ordering.py +++ b/api/questions/model_builders/ordering.py @@ -1,5 +1,5 @@ from ...models import Ordering -from ..process_helper import trim_md_to_html +from api.formats.docx.process_helper import trim_md_to_html def build_inline_ORD(question, answers): question.questiontype = 'ORD' diff --git a/api/process/questionbuilder/truefalse.py b/api/questions/model_builders/truefalse.py similarity index 97% rename from api/process/questionbuilder/truefalse.py rename to api/questions/model_builders/truefalse.py index f185773..151b0aa 100644 --- a/api/process/questionbuilder/truefalse.py +++ b/api/questions/model_builders/truefalse.py @@ -1,5 +1,5 @@ from ...models import TrueFalse -from ..process_helper import add_error_message, trim_text, trim_md_to_html, markdown_to_plain +from api.formats.docx.process_helper import add_error_message, trim_text, trim_md_to_html, markdown_to_plain from api.logging.ErrorTypes import TFNoAnswerError, TFSelectedAnswerError from celery.utils.log import get_task_logger from api.logging.logging_adapter import FilenameLoggingAdapter diff --git a/api/process/questionbuilder/writtenresponse.py 
b/api/questions/model_builders/writtenresponse.py similarity index 94% rename from api/process/questionbuilder/writtenresponse.py rename to api/questions/model_builders/writtenresponse.py index a984410..40f931c 100644 --- a/api/process/questionbuilder/writtenresponse.py +++ b/api/questions/model_builders/writtenresponse.py @@ -1,5 +1,5 @@ from ...models import WrittenResponse -from ..process_helper import add_warning_message, trim_md_to_html +from api.formats.docx.process_helper import add_warning_message, trim_md_to_html from api.logging.WarningTypes import WREndAnswerExistWarning def build_inline_WR_with_keyword(question, wr_answer): diff --git a/api/scorm/XmlReader.py b/api/scorm/XmlReader.py deleted file mode 100644 index 12e2516..0000000 --- a/api/scorm/XmlReader.py +++ /dev/null @@ -1,1841 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at https://mozilla.org/MPL/2.0/. - -import os -import xml.etree.cElementTree as ET -from zipfile import ZipFile -from os import path, makedirs -from django.conf import settings -from bs4 import BeautifulSoup -import re -import base64 -import html -from api.models import ( - QuestionLibrary, Section, Question, - MultipleChoice, MultipleChoiceAnswer, - TrueFalse, Fib, MultipleSelect, MultipleSelectAnswer, - Matching, MatchingChoice, MatchingAnswer, - Ordering, WrittenResponse -) -from api.models import ( - QuestionLibrary, Section, Question, - MultipleChoice, MultipleChoiceAnswer, - TrueFalse, Fib, MultipleSelect, MultipleSelectAnswer, - Matching, MatchingChoice, MatchingAnswer, - Ordering, WrittenResponse -) - - -class XmlReader: - """ - Reads and parses SCORM XML files (questiondb.xml, imsmanifest.xml) - and extracts data into Django models. - This class mirrors the structure of XmlWriter but in reverse. 
- """ - - def __init__(self, scorm_zip_path, extract_to_path=None): - """ - Initialize XmlReader with a SCORM ZIP file path. - - Args: - scorm_zip_path: Path to the SCORM ZIP file - extract_to_path: Optional path to extract ZIP contents (defaults to temp directory) - """ - self.scorm_zip_path = scorm_zip_path - self.extract_to_path = extract_to_path - self.questiondb_xml = None - self.imsmanifest_xml = None - self.extracted_path = None - - # Extract ZIP file - self._extract_zip() - - # Parse XML files - self._parse_xml_files() - - def _extract_zip(self): - """Extract SCORM ZIP file to temporary directory.""" - if not path.exists(self.scorm_zip_path): - raise FileNotFoundError(f"SCORM ZIP file not found: {self.scorm_zip_path}") - - # Create extraction directory if not provided - if self.extract_to_path is None: - # Use a temp directory based on the ZIP filename - zip_basename = path.splitext(path.basename(self.scorm_zip_path))[0] - self.extract_to_path = path.join(settings.MEDIA_ROOT, f"scorm_extract_{zip_basename}") - - # Create directory if it doesn't exist - if not path.exists(self.extract_to_path): - makedirs(self.extract_to_path) - - # Extract ZIP file - with ZipFile(self.scorm_zip_path, 'r') as zip_ref: - zip_ref.extractall(self.extract_to_path) - - self.extracted_path = self.extract_to_path - - def _parse_xml_files(self): - """Parse questiondb.xml and imsmanifest.xml from extracted files.""" - questiondb_path = path.join(self.extracted_path, "questiondb.xml") - imsmanifest_path = path.join(self.extracted_path, "imsmanifest.xml") - - if not path.exists(questiondb_path): - raise FileNotFoundError(f"questiondb.xml not found in SCORM package: {questiondb_path}") - - if not path.exists(imsmanifest_path): - raise FileNotFoundError(f"imsmanifest.xml not found in SCORM package: {imsmanifest_path}") - - # Parse XML files - self.questiondb_xml = ET.parse(questiondb_path) - self.imsmanifest_xml = ET.parse(imsmanifest_path) - - def parse_manifest(self): - """ - Parse 
imsmanifest.xml and extract metadata. - - Returns: - dict: Dictionary containing manifest metadata - """ - root = self.imsmanifest_xml.getroot() - - manifest_data = { - 'identifier': root.get('identifier', ''), - 'resources': [] - } - - # Parse resources - resources_el = root.find('resources') - if resources_el is not None: - for resource_el in resources_el.findall('resource'): - resource_data = { - 'identifier': resource_el.get('identifier', ''), - 'type': resource_el.get('type', ''), - 'material_type': resource_el.get('{http://desire2learn.com/xsd/d2lcp_v2p0}material_type', ''), - 'href': resource_el.get('href', ''), - 'link_target': resource_el.get('{http://desire2learn.com/xsd/d2lcp_v2p0}link_target', ''), - 'title': resource_el.get('title', '') - } - manifest_data['resources'].append(resource_data) - - return manifest_data - - def parse_questiondb(self): - """ - Parse questiondb.xml and extract question library structure. - - Returns: - dict: Dictionary containing question library data structure - """ - root = self.questiondb_xml.getroot() - - # Find objectbank element - objectbank_el = root.find('objectbank') - if objectbank_el is None: - raise ValueError("objectbank element not found in questiondb.xml") - - question_library_data = { - 'ident': objectbank_el.get('ident', ''), - 'sections': [] - } - - # Parse base section (root section) - base_sections = objectbank_el.findall('section') - for section_el in base_sections: - section_data = self._parse_section(section_el) - question_library_data['sections'].append(section_data) - - return question_library_data - - def _parse_section(self, section_el): - """ - Parse a section element and extract section data. 
- - Args: - section_el: XML element representing a section - - Returns: - dict: Dictionary containing section data - """ - section_data = { - 'ident': section_el.get('ident', ''), - 'title': section_el.get('title', ''), - 'shuffle': False, - 'is_title_displayed': True, - 'is_text_displayed': False, - 'text': '', - 'questions': [] - } - - # Check for shuffle (selection_ordering with Random order) - selection_ordering = section_el.find('selection_ordering') - if selection_ordering is not None: - order_el = selection_ordering.find('order') - if order_el is not None and order_el.get('order_type') == 'Random': - section_data['shuffle'] = True - - # Parse presentation material (section text) - presentation_material = section_el.find('presentation_material') - if presentation_material is not None: - text = self._extract_material_text(presentation_material) - section_data['text'] = text - - # Parse sectionproc_extension - sectionproc = section_el.find('sectionproc_extension') - if sectionproc is not None: - display_name = sectionproc.find('{http://desire2learn.com/xsd/d2lcp_v2p0}display_section_name') - if display_name is not None: - section_data['is_title_displayed'] = display_name.text.lower() == 'yes' - - type_display = sectionproc.find('{http://desire2learn.com/xsd/d2lcp_v2p0}type_display_section') - if type_display is not None: - section_data['is_text_displayed'] = type_display.text == '1' - - # Parse nested sections - nested_sections = section_el.findall('section') - for nested_section_el in nested_sections: - nested_section_data = self._parse_section(nested_section_el) - section_data['sections'] = section_data.get('sections', []) - section_data['sections'].append(nested_section_data) - - # Parse questions (items) - items = section_el.findall('item') - for item_el in items: - question_data = self._parse_question(item_el) - section_data['questions'].append(question_data) - - return section_data - - def _parse_question(self, item_el): - """ - Parse a question (item) 
element and extract question data. - - Args: - item_el: XML element representing a question item - - Returns: - dict: Dictionary containing question data - """ - question_data = { - 'ident': item_el.get('ident', ''), - 'label': item_el.get('label', ''), - 'title': item_el.get('title', ''), - 'question_type': None, - 'points': 1.0, - 'text': '', - 'hint': None, - 'feedback': None, - 'question_specific_data': {} - } - - # Parse itemmetadata to get question type and points - itemmetadata = item_el.find('itemmetadata') - if itemmetadata is not None: - qtidata = itemmetadata.find('qtimetadata') - if qtidata is not None: - for field in qtidata.findall('qti_metadatafield'): - fieldlabel = field.find('fieldlabel') - fieldentry = field.find('fieldentry') - if fieldlabel is not None and fieldentry is not None: - if fieldlabel.text == 'qmd_questiontype': - question_data['question_type'] = fieldentry.text - elif fieldlabel.text == 'qmd_weighting': - try: - question_data['points'] = float(fieldentry.text) - except (ValueError, TypeError): - pass - - # Parse presentation to get question text - presentation = item_el.find('presentation') - if presentation is not None: - question_text = self._extract_question_text(presentation) - question_data['text'] = question_text - - # Parse hint - hint_el = item_el.find('hint') - if hint_el is not None: - question_data['hint'] = self._extract_hint_text(hint_el) - - # Parse general feedback - feedback_els = item_el.findall('itemfeedback') - for feedback_el in feedback_els: - # General feedback typically has ident matching the question label - if feedback_el.get('ident') == question_data['label']: - question_data['feedback'] = self._extract_feedback_text(feedback_el) - - # Parse question-specific data based on type - question_type = question_data['question_type'] - if question_type: - if question_type == 'Multiple Choice': - question_data['question_specific_data'] = self._parse_multiple_choice(item_el, question_data['label']) - 
question_data['question_type_code'] = 'MC' - elif question_type == 'True/False': - question_data['question_specific_data'] = self._parse_true_false(item_el, question_data['label']) - question_data['question_type_code'] = 'TF' - elif question_type == 'Fill in the Blanks': - question_data['question_specific_data'] = self._parse_fill_in_the_blanks(item_el, question_data['label']) - question_data['question_type_code'] = 'FIB' - elif question_type == 'Multi-Select': - question_data['question_specific_data'] = self._parse_multi_select(item_el, question_data['label']) - question_data['question_type_code'] = 'MS' - elif question_type == 'Matching': - question_data['question_specific_data'] = self._parse_matching(item_el, question_data['label']) - question_data['question_type_code'] = 'MAT' - elif question_type == 'Ordering': - question_data['question_specific_data'] = self._parse_ordering(item_el, question_data['label']) - question_data['question_type_code'] = 'ORD' - elif question_type == 'Long Answer': - question_data['question_specific_data'] = self._parse_written_response(item_el, question_data['label']) - question_data['question_type_code'] = 'WR' - - return question_data - - def _extract_material_text(self, material_el): - """ - Extract text content from material element, handling CDATA. - Automatically cleans CDATA whitespace and HTML tags. - Converts SCORM image file paths to base64 data URIs. - Decodes HTML entities (including numeric entities for emojis and symbols). - """ - text_parts = [] - - # Navigate through flow_mat -> material -> mattext - flow_mat = material_el.find('flow_mat') - if flow_mat is not None: - materials = flow_mat.findall('.//material') - for material in materials: - mattext = material.find('mattext') - if mattext is not None: - # Get text content (handles CDATA) - raw_text = mattext.text if mattext.text else '' - # Also check for CDATA in tail - if mattext.tail: - raw_text += mattext.tail - # Decode HTML entities (handles &, <, >, 🤣, etc.) 
- decoded_text = html.unescape(raw_text) - # Clean CDATA whitespace while preserving HTML tags - cleaned_text = self._clean_cdata(decoded_text) - # Convert SCORM image file paths to base64 - cleaned_text = self._inline_scorm_images(cleaned_text) - text_parts.append(cleaned_text) - - return ''.join(text_parts) - - def _extract_question_text(self, presentation_el): - """ - Extract question text from presentation element. - Automatically cleans CDATA whitespace and HTML tags. - Converts SCORM image file paths to base64 data URIs. - Decodes HTML entities (including numeric entities for emojis and symbols). - """ - text_parts = [] - - flow = presentation_el.find('flow') - if flow is not None: - # Find first material element (question text) - material = flow.find('material') - if material is not None: - mattext = material.find('mattext') - if mattext is not None: - raw_text = mattext.text if mattext.text else '' - if mattext.tail: - raw_text += mattext.tail - # Decode HTML entities (handles &, <, >, 🤣, etc.) - decoded_text = html.unescape(raw_text) - # Clean CDATA whitespace while preserving HTML tags - cleaned_text = self._clean_cdata(decoded_text) - # Convert SCORM image file paths to base64 - cleaned_text = self._inline_scorm_images(cleaned_text) - text_parts.append(cleaned_text) - - return ''.join(text_parts) - - def _extract_hint_text(self, hint_el): - """Extract text from hint element.""" - hintmaterial = hint_el.find('hintmaterial') - if hintmaterial is not None: - return self._extract_material_text(hintmaterial) - return None - - def _extract_feedback_text(self, feedback_el): - """ - Extract text from feedback element. - Automatically cleans CDATA whitespace while preserving HTML tags. - Converts SCORM image file paths to base64 data URIs. - Decodes HTML entities (including numeric entities for emojis and symbols). 
- """ - material = feedback_el.find('material') - if material is not None: - mattext = material.find('mattext') - if mattext is not None: - raw_text = mattext.text if mattext.text else '' - # Decode HTML entities (handles &, <, >, 🤣, etc.) - decoded_text = html.unescape(raw_text) - # Clean CDATA whitespace while preserving HTML tags - cleaned_text = self._clean_cdata(decoded_text) - # Convert SCORM image file paths to base64 - return self._inline_scorm_images(cleaned_text) - return None - - def _clean_cdata(self, text): - """ - Clean text extracted from CDATA sections in SCORM XML. - - SCORM XML often contains CDATA with excessive whitespace, newlines, and tabs - that are formatting artifacts rather than meaningful content. This method: - 1. Preserves HTML tags (e.g.,

, , etc.) - 2. Normalizes whitespace between HTML tags (multiple spaces/newlines/tabs -> single space) - 3. Trims leading/trailing whitespace - - This ensures clean JSON output while preserving HTML structure for proper rendering. - - Args: - text: Raw text string from XML CDATA - - Returns: - str: Cleaned text with normalized whitespace but HTML tags preserved - """ - if not text: - return '' - - try: - # Normalize whitespace while preserving HTML tags - # Replace sequences of whitespace (spaces, tabs, newlines) with a single space - # But be careful not to break HTML tag structure - cleaned = re.sub(r'[ \t\n\r]+', ' ', text) - # Remove whitespace between HTML tags (e.g., "> <" -> "><") - cleaned = re.sub(r'>\s+<', '><', cleaned) - # Trim leading/trailing whitespace - cleaned = cleaned.strip() - return cleaned - except Exception: - # Fallback: if regex fails, just normalize whitespace - cleaned = re.sub(r'\s+', ' ', text).strip() - return cleaned - - def _inline_scorm_images(self, html_text): - """ - Convert SCORM image file paths to base64 data URIs in HTML text. - - SCORM packages store images as files (e.g., ./assessment-assets/.../image_1.png) - in the ZIP. This method extracts those images and converts them to base64 - data URIs so the JSON is self-contained. 
- - Args: - html_text: HTML text containing tags with file paths - - Returns: - str: HTML text with image file paths replaced with base64 data URIs - """ - if not html_text or not self.extracted_path: - return html_text - - # Find all img tags with src attributes - img_pattern = r']*?)src=["\']([^"\']+)["\']([^>]*?)>' - - def replace_image(match): - before_src = match.group(1) - img_src = match.group(2) - after_src = match.group(3) - - # Skip if already base64 or data URI - if img_src.startswith('data:') or 'base64' in img_src: - return match.group(0) - - # Skip if absolute URL - if img_src.startswith('http://') or img_src.startswith('https://'): - return match.group(0) - - try: - # Extract image path (remove leading ./ if present) - img_path = img_src.lstrip('./') - - # Try to find the image file in the extracted SCORM directory - # SCORM images are typically in assessment-assets folder - possible_paths = [ - path.join(self.extracted_path, img_path), - path.join(self.extracted_path, 'assessment-assets', path.basename(img_path)), - ] - - # Also try to find in any subdirectory - image_file = None - for possible_path in possible_paths: - if path.exists(possible_path) and path.isfile(possible_path): - image_file = possible_path - break - - # If not found, search recursively - if not image_file: - for root, dirs, files in os.walk(self.extracted_path): - if path.basename(img_path) in files: - image_file = path.join(root, path.basename(img_path)) - break - - if image_file and path.exists(image_file): - # Read image file and convert to base64 - with open(image_file, 'rb') as f: - image_data = f.read() - base64_data = base64.b64encode(image_data).decode('utf-8') - - # Determine MIME type from file extension - ext = path.splitext(image_file)[1].lower() - mime_types = { - '.png': 'image/png', - '.jpg': 'image/jpeg', - '.jpeg': 'image/jpeg', - '.gif': 'image/gif', - '.svg': 'image/svg+xml', - '.webp': 'image/webp' - } - mime_type = mime_types.get(ext, 'image/png') - - # 
Replace with base64 data URI - base64_src = f'data:{mime_type};base64,{base64_data}' - import logging - logger = logging.getLogger(__name__) - logger.info(f"Converted SCORM image {path.basename(image_file)} to base64 ({len(base64_data)} chars)") - return f'' - else: - # Image file not found, log warning - import logging - logger = logging.getLogger(__name__) - logger.warning(f"SCORM image not found: {img_src} (searched in {self.extracted_path})") - # Return original img tag (will show as broken image or alt text) - return match.group(0) - except Exception as e: - # If any error occurs, return original img tag - return match.group(0) - - # Replace all img tags - result = re.sub(img_pattern, replace_image, html_text) - return result - - def _html_to_markdown(self, html_text): - """ - Convert HTML text with base64 images to markdown format. - - Preserves ALL tags as HTML (Pandoc supports HTML in markdown). - Converts remaining HTML to plain text. - - Args: - html_text: HTML text containing image tags (base64 or file paths) - - Returns: - str: Markdown formatted text with img tags preserved as HTML - """ - if not html_text: - return '' - - # Extract ALL img tags (both base64 and file paths) and preserve them as HTML - # Pattern to match any img tag - img_pattern = r']*?>' - - # Store HTML img tags temporarily with placeholders - html_images = {} - image_counter = 0 - - def preserve_img_tag(match): - nonlocal image_counter - # Preserve the entire img tag as HTML - full_img_tag = match.group(0) - placeholder = f'__HTML_IMAGE_{image_counter}__' - html_images[placeholder] = full_img_tag - image_counter += 1 - return placeholder - - # Preserve MathML blocks with placeholders so we can convert them to TeX - math_blocks = {} - math_counter = 0 - math_pattern = r'' - - def preserve_math(match): - nonlocal math_counter - full_math = match.group(0) - placeholder = f'__MATH_BLOCK_{math_counter}__' - # Try to extract TeX from annotation - tex_match = re.search( - 
r']*encoding=["\']application/x-tex["\'][^>]*>(.*?)', - full_math, - flags=re.IGNORECASE | re.DOTALL - ) - tex = tex_match.group(1) if tex_match else None - math_blocks[placeholder] = {"tex": tex, "raw": full_math} - math_counter += 1 - return placeholder - - # Replace all img tags and math blocks with placeholders - result = re.sub(img_pattern, preserve_img_tag, html_text) - result = re.sub(math_pattern, preserve_math, result, flags=re.IGNORECASE) - - # Replace

and

tags with newlines to preserve paragraph breaks - # Do this before BeautifulSoup processes it to ensure paragraph breaks are preserved - result = re.sub(r'

', '\n', result, flags=re.IGNORECASE) - result = re.sub(r']*>', '\n', result, flags=re.IGNORECASE) - - # Convert remaining HTML to plain text using BeautifulSoup - try: - soup = BeautifulSoup(result, 'html.parser') - # Replace
with a placeholder so only those become hard breaks - for br in soup.find_all('br'): - br.replace_with('[[[BR]]]') - # Extract text while keeping other inline tags tight (no extra newlines) - text = soup.get_text(separator=' ', strip=False) - # Turn our placeholders into real newlines - text = text.replace('[[[BR]]]', '\n') - except Exception: - # Fallback: if BeautifulSoup fails, just clean up HTML tags manually - # But preserve placeholders - text = re.sub(r'<(?!/?__HTML_IMAGE_)[^>]+>', '', result) - - # Restore MathML (prefer TeX) and HTML img tags from placeholders - for placeholder, math_info in math_blocks.items(): - replacement = None - if math_info.get("tex"): - tex = math_info["tex"].strip() - replacement = f"$$ {tex} $$" - else: - replacement = math_info.get("raw", "") - text = text.replace(placeholder, replacement) - for placeholder, html_img in html_images.items(): - text = text.replace(placeholder, html_img) - - # Normalize whitespace on each line but keep explicit newlines and math/img blocks - text = text.replace('\r', '') - # Normalize multiple consecutive newlines (from paragraph breaks) to double newlines - # This ensures paragraphs are separated by blank lines in markdown - text = re.sub(r'\n{3,}', '\n\n', text) # 3+ newlines -> 2 newlines - normalized_lines = [] - for line in text.split('\n'): - stripped = line.strip() - if stripped == '': - normalized_lines.append('') - continue - if re.search(r']*>', stripped, flags=re.IGNORECASE) or re.search(r'= 1: - true_ident = response_labels[0].get('ident', '') - if len(response_labels) >= 2: - false_ident = response_labels[1].get('ident', '') - - resprocessing = item_el.find('resprocessing') - - if resprocessing is not None: - for respcondition in resprocessing.findall('respcondition'): - conditionvar = respcondition.find('conditionvar') - if conditionvar is not None: - varequal = conditionvar.find('varequal') - if varequal is not None and varequal.get('respident') == question_lid: - answer_ident = 
def _parse_fill_in_the_blanks(self, item_el, question_ident):
    """
    Parse fill in the blanks question data.
    Mirrors generate_fill_in_the_blanks() from XmlWriter.

    Args:
        item_el: QTI ``<item>`` element of the question.
        question_ident: Identifier prefix of the question. The accepted
            answers of a blank are the ``<varequal>`` values whose
            ``respident`` equals ``question_ident + str(position) + "_ANS"``.

    Returns:
        dict: ``{'fibs': [...]}`` holding one entry per ``<material>``
        (``'fibquestion'``) or ``<response_str>`` (``'fibanswer'``) child of
        ``presentation/flow``, in document order. The accepted answers of a
        blank are joined with commas.
    """
    # Local import: ``html`` is not among the module-level imports visible at
    # the top of this file, so a bare ``html.unescape`` could raise NameError.
    import html

    fib_data = {
        'fibs': []  # List of fibquestion and fibanswer items in order
    }

    presentation = item_el.find('presentation')
    flow = presentation.find('flow') if presentation is not None else None
    if flow is None:
        return fib_data

    # Index every non-empty accepted answer by its respident in ONE pass,
    # instead of re-scanning <resprocessing> for each blank.
    answers_by_respident = {}
    resprocessing = item_el.find('resprocessing')
    if resprocessing is not None:
        for respcondition in resprocessing.findall('respcondition'):
            conditionvar = respcondition.find('conditionvar')
            if conditionvar is None:
                continue
            varequal = conditionvar.find('varequal')
            if varequal is not None and varequal.text:
                answers_by_respident.setdefault(
                    varequal.get('respident'), []
                ).append(varequal.text)

    # NOTE(review): the position counter advances once per flow child, so that
    # every entry gets a distinct 'order'. The original indentation of the
    # increment was not recoverable - confirm against XmlWriter.
    for position, child in enumerate(flow, start=1):
        if child.tag == 'material':
            # Text part. Entities are decoded, but the text is deliberately not
            # whitespace-cleaned so the spacing around blanks is preserved.
            mattext = child.find('mattext')
            raw_text = mattext.text if mattext is not None and mattext.text else ''
            fib_data['fibs'].append({
                'type': 'fibquestion',
                'text': html.unescape(raw_text),
                'order': position
            })
        elif child.tag == 'response_str':
            # Blank: collect the answers registered for this position.
            question_ans = question_ident + str(position) + "_ANS"
            accepted = answers_by_respident.get(question_ans, [])
            fib_data['fibs'].append({
                'type': 'fibanswer',
                'text': ','.join(accepted),
                'order': position,
                'size': 30  # Default from XmlWriter
            })

    return fib_data
def _parse_multi_select(self, item_el, question_ident):
    """
    Parse multi-select question data.
    Mirrors generate_multi_select() from XmlWriter.

    Args:
        item_el: QTI ``<item>`` element of the question.
        question_ident: Identifier prefix of the question (not used by this
            parser; kept for a uniform ``_parse_*`` signature).

    Returns:
        dict: ``randomize``, ``enumeration``, ``style``, ``grading_type`` and
        ``answers`` (a list of dicts with ``answer``, ``is_correct``,
        ``answer_feedback`` and a 1-based ``order``). Defaults are returned
        for anything missing from the XML.
    """
    # Local import: ``html`` is not among the module-level imports visible at
    # the top of this file.
    import html

    d2l_ns = '{http://desire2learn.com/xsd/d2lcp_v2p0}'

    ms_data = {
        'randomize': False,
        'enumeration': 4,
        'style': 2,
        'grading_type': 2,
        'answers': []
    }

    presentation = item_el.find('presentation')
    flow = presentation.find('flow') if presentation is not None else None
    if flow is None:
        return ms_data

    # D2L-specific settings; a missing or non-numeric value keeps the default.
    response_ext = flow.find('response_extension')
    if response_ext is not None:
        for setting in ('enumeration', 'grading_type'):
            setting_el = response_ext.find(d2l_ns + setting)
            if setting_el is not None and setting_el.text:
                try:
                    ms_data[setting] = int(setting_el.text)
                except (ValueError, TypeError):
                    pass

    response_lid = flow.find('response_lid')
    if response_lid is None:
        return ms_data
    question_lid = response_lid.get('ident', '')

    render_choice = response_lid.find('render_choice')
    if render_choice is not None:
        ms_data['randomize'] = render_choice.get('shuffle', 'no').lower() == 'yes'

    # Group the respconditions of this question by the answer ident they test,
    # in one pass, instead of re-scanning <resprocessing> for every answer.
    conditions_by_answer = {}
    resprocessing = item_el.find('resprocessing')
    if resprocessing is not None:
        for respcondition in resprocessing.findall('respcondition'):
            conditionvar = respcondition.find('conditionvar')
            if conditionvar is None:
                continue
            varequal = conditionvar.find('varequal')
            if varequal is not None and varequal.get('respident') == question_lid:
                conditions_by_answer.setdefault(varequal.text, []).append(respcondition)

    # First <itemfeedback> per ident. A lookup table avoids building an XPath
    # predicate from the ident, which fails when the ident contains a quote.
    feedback_by_ident = {}
    for feedback_el in item_el.iter('itemfeedback'):
        feedback_by_ident.setdefault(feedback_el.get('ident'), feedback_el)

    answer_index = 1
    for flow_label in response_lid.findall('.//flow_label'):
        response_label = flow_label.find('response_label')
        if response_label is None:
            continue
        answer_ident = response_label.get('ident', '')

        # Answer text: decode entities, then clean CDATA whitespace while
        # preserving HTML tags.
        mattext = response_label.find('.//mattext')
        answer_text = ''
        if mattext is not None:
            answer_text = self._clean_cdata(html.unescape(mattext.text or ''))

        is_correct = False
        answer_feedback = None
        for respcondition in conditions_by_answer.get(answer_ident, []):
            # A setvar that adds to D2L_Correct marks a correct answer.
            setvar = respcondition.find('setvar')
            if setvar is not None and setvar.get('varname') == 'D2L_Correct':
                is_correct = True

            # Answer-specific feedback; a later condition overrides an earlier one.
            displayfeedback = respcondition.find('displayfeedback')
            if displayfeedback is not None:
                feedback_el = feedback_by_ident.get(displayfeedback.get('linkrefid', ''))
                if feedback_el is not None:
                    answer_feedback = self._extract_feedback_text(feedback_el)

        ms_data['answers'].append({
            'answer': answer_text,
            'is_correct': is_correct,
            'answer_feedback': answer_feedback,
            'order': answer_index
        })
        answer_index += 1

    return ms_data
def _parse_matching(self, item_el, question_ident):
    """
    Parse matching question data.
    Mirrors generate_matching() from XmlWriter.

    Args:
        item_el: QTI ``<item>`` element of the question.
        question_ident: Identifier prefix of the question (not used by this
            parser; kept for a uniform ``_parse_*`` signature).

    Returns:
        dict: ``grading_type`` (int, default 0) and ``choices``: one dict per
        ``<response_grp>`` with ``choice_text`` and ``matching_answers``. The
        latter holds the text of the choice's correct answer, or is empty
        when no correct answer could be resolved.
    """
    # Local import: ``html`` is not among the module-level imports visible at
    # the top of this file.
    import html

    mat_data = {
        'grading_type': 0,
        'choices': []
    }

    presentation = item_el.find('presentation')
    flow = presentation.find('flow') if presentation is not None else None
    if flow is None:
        return mat_data

    # grading_type lives in the D2L response_extension; keep the default when
    # it is missing or not an integer.
    response_ext = flow.find('response_extension')
    if response_ext is not None:
        grading_type_el = response_ext.find('{http://desire2learn.com/xsd/d2lcp_v2p0}grading_type')
        if grading_type_el is not None and grading_type_el.text:
            try:
                mat_data['grading_type'] = int(grading_type_el.text)
            except (ValueError, TypeError):
                pass

    # One response_grp per choice.
    response_grps = flow.findall('response_grp')

    # First pass: collect every distinct answer (ident -> text) offered by any
    # choice. Labels may all sit inside a single flow_label, hence './/'.
    matching_answers = {}
    for response_grp in response_grps:
        render_choice = response_grp.find('render_choice')
        if render_choice is None:
            continue
        for response_label in render_choice.findall('.//response_label'):
            mattext = response_label.find('.//mattext')
            if mattext is None:
                continue
            answer_ident = response_label.get('ident', '')
            # Entities are decoded here as well, consistent with how the
            # choice texts below are handled.
            answer_text = self._clean_cdata(html.unescape(mattext.text or ''))
            if answer_text and answer_ident not in matching_answers:
                matching_answers[answer_ident] = answer_text

    # Resolve the correct answer ident of every choice in one pass: the first
    # respcondition whose setvar targets D2L_Correct wins (setdefault).
    correct_by_choice = {}
    resprocessing = item_el.find('resprocessing')
    if resprocessing is not None:
        for respcondition in resprocessing.findall('respcondition'):
            conditionvar = respcondition.find('conditionvar')
            if conditionvar is None:
                continue
            varequal = conditionvar.find('varequal')
            if varequal is None:
                continue
            setvar = respcondition.find('setvar')
            if setvar is not None and setvar.get('varname') == 'D2L_Correct':
                correct_by_choice.setdefault(varequal.get('respident'), varequal.text)

    # Second pass: emit each choice together with its correct answer.
    for response_grp in response_grps:
        choice_ident = response_grp.get('respident', '')

        choice_text = ''
        material = response_grp.find('material')
        if material is not None:
            mattext = material.find('mattext')
            if mattext is not None:
                choice_text = self._clean_cdata(html.unescape(mattext.text or ''))

        matching_answers_list = []
        correct_answer_ident = correct_by_choice.get(choice_ident)
        if correct_answer_ident and correct_answer_ident in matching_answers:
            matching_answers_list.append({
                'answer_text': matching_answers[correct_answer_ident]
            })

        mat_data['choices'].append({
            'choice_text': choice_text,
            'matching_answers': matching_answers_list
        })

    return mat_data
def _parse_ordering(self, item_el, question_ident):
    """
    Parse ordering question data.
    Mirrors generate_ordering() from XmlWriter.

    Args:
        item_el: QTI ``<item>`` element of the question.
        question_ident: Identifier prefix of the question. The feedback of the
            n-th item is the ``<itemfeedback>`` whose ident is
            ``question_ident + "_IF" + str(n)``.

    Returns:
        dict: ``{'items': [...]}`` with one dict per ``<response_label>`` of
        the ordered response group, each holding ``text``, a 1-based ``order``
        and ``ord_feedback`` (``None`` when there is no matching feedback).
    """
    # Local import: ``html`` is not among the module-level imports visible at
    # the top of this file.
    import html

    ord_data = {
        'items': []
    }

    presentation = item_el.find('presentation')
    flow = presentation.find('flow') if presentation is not None else None
    if flow is None:
        return ord_data

    response_grp = flow.find('response_grp[@rcardinality="Ordered"]')
    render_choice = response_grp.find('render_choice') if response_grp is not None else None
    if render_choice is None:
        return ord_data

    # First <itemfeedback> per ident, built once instead of one XPath search
    # per ordering item.
    feedback_by_ident = {}
    for feedback_el in item_el.iter('itemfeedback'):
        feedback_by_ident.setdefault(feedback_el.get('ident'), feedback_el)

    feedback_prefix = question_ident + "_IF"

    # Labels may all be inside one flow_label, hence the './/' search.
    labels = render_choice.findall('.//response_label')
    for order_index, response_label in enumerate(labels, start=1):
        # Decode entities, then clean CDATA whitespace while preserving HTML tags.
        mattext = response_label.find('.//mattext')
        text = ''
        if mattext is not None:
            text = self._clean_cdata(html.unescape(mattext.text or ''))

        ord_feedback = None
        feedback_el = feedback_by_ident.get(feedback_prefix + str(order_index))
        if feedback_el is not None:
            ord_feedback = self._extract_feedback_text(feedback_el)

        ord_data['items'].append({
            'text': text,
            'order': order_index,
            'ord_feedback': ord_feedback
        })

    return ord_data
def _parse_written_response(self, item_el, question_ident):
    """
    Parse written response question data.
    Mirrors generate_written_response() from XmlWriter.

    Args:
        item_el: QTI ``<item>`` element of the question.
        question_ident: Identifier prefix of the question (not used by this
            parser; kept for a uniform ``_parse_*`` signature).

    Returns:
        dict: ``enable_student_editor`` (bool), ``initial_text`` (str or
        ``None`` when absent or empty), ``answer_key`` (str, ``''`` when
        absent) and ``enable_attachments`` (always ``False``; nothing in the
        XML is read for it here).
    """
    # Local import: ``html`` is not among the module-level imports visible at
    # the top of this file.
    import html

    wr_data = {
        'enable_student_editor': False,
        'initial_text': None,
        'answer_key': '',
        'enable_attachments': False
    }

    def find_mattext(container_tag, material_tag):
        """Return the first <mattext> under item/container/material, or None."""
        container = item_el.find(container_tag)
        if container is None:
            return None
        material = container.find(material_tag)
        if material is None:
            return None
        return material.find('.//mattext')

    # The HTML-editor flag is a D2L extension below presentation/flow.
    presentation = item_el.find('presentation')
    flow = presentation.find('flow') if presentation is not None else None
    response_ext = flow.find('response_extension') if flow is not None else None
    if response_ext is not None:
        editor_el = response_ext.find('{http://desire2learn.com/xsd/d2lcp_v2p0}has_htmleditor')
        if editor_el is not None:
            wr_data['enable_student_editor'] = (editor_el.text or '').lower() == 'yes'

    # Answer key: entities are decoded exactly as for initial_text below, then
    # CDATA whitespace is cleaned while preserving HTML tags.
    key_mattext = find_mattext('answer_key', 'answer_key_material')
    if key_mattext is not None:
        wr_data['answer_key'] = self._clean_cdata(html.unescape(key_mattext.text or ''))

    # Initial text: an empty result is stored as None.
    initial_mattext = find_mattext('initial_text', 'initial_text_material')
    if initial_mattext is not None:
        cleaned_text = self._clean_cdata(html.unescape(initial_mattext.text or ''))
        wr_data['initial_text'] = cleaned_text if cleaned_text else None

    return wr_data
def populate_django_models(self, question_library=None):
    """
    Populate Django models from parsed SCORM XML data.

    Each root section that has questions or text of its own becomes a Section
    with ``is_main_content=True``; every nested section becomes a Section
    with ``is_main_content=False`` placed right after it. A root section that
    has neither own content nor nested sections creates nothing. Section
    ``order`` values and question ``index`` values both start at 1 and run
    continuously across the whole library.

    All writes happen inside one database transaction, so a failure part-way
    through does not leave a half-imported library behind.

    Args:
        question_library: Optional existing QuestionLibrary instance to use.
                         If None, a new one will be created.

    Returns:
        QuestionLibrary: The QuestionLibrary instance with all sections and questions
    """
    from django.db import transaction

    # Parse questiondb to get structure
    question_library_data = self.parse_questiondb()
    sections_data = question_library_data['sections']

    # Main title comes from the first (base) section.
    main_title = sections_data[0].get('title', '') if sections_data else ''

    question_index = 1  # Global question index that continues across all sections

    def add_section(section_data, order, is_main_content):
        """Create one Section and its questions, advancing the global index."""
        nonlocal question_index
        section = Section.objects.create(
            question_library=question_library,
            is_main_content=is_main_content,
            order=order,
            title=section_data.get('title', ''),
            is_title_displayed=section_data.get('is_title_displayed', True),
            text=section_data.get('text', ''),
            is_text_displayed=section_data.get('is_text_displayed', False),
            shuffle=section_data.get('shuffle', False)
        )
        for question_data in section_data.get('questions', []):
            self._create_question_model(section, question_data, question_index)
            question_index += 1

    with transaction.atomic():
        # Use existing QuestionLibrary or create a new one
        if question_library is None:
            # NOTE(review): shuffle is created as False and is not updated
            # anywhere in this method.
            question_library = QuestionLibrary.objects.create(
                main_title=main_title,
                shuffle=False
            )
        else:
            # Update existing instance with parsed data
            question_library.main_title = main_title
            question_library.save()

        section_order = 1
        for section_data in sections_data:
            nested_sections = section_data.get('sections', [])
            # ``or ''`` guards against an explicit None text.
            has_own_content = (
                bool(section_data.get('questions'))
                or (section_data.get('text') or '').strip() != ''
            )

            if has_own_content:
                # The root section represents the main title.
                add_section(section_data, section_order, True)
                section_order += 1

            # Without own content, only the nested sections are materialised.
            for nested_section_data in nested_sections:
                add_section(nested_section_data, section_order, False)
                section_order += 1

    return question_library
def _create_question_model(self, section, question_data, index):
    """
    Create a Question row and the type-specific rows for one parsed question.

    Args:
        section: Section the question belongs to.
        question_data: Parsed question dict; reads ``title``,
            ``question_type_code``, ``text``, ``points``, ``hint``,
            ``feedback`` and ``question_specific_data``.
        index: Value stored in the question's ``index`` field.

    Returns:
        Question: The created question. Unknown type codes create the
        Question only.
    """
    fields = {
        'section': section,
        'index': index,
        'title': question_data.get('title', ''),
        'questiontype': question_data.get('question_type_code', ''),
        'text': question_data.get('text', ''),
        'points': question_data.get('points', 1.0),
        'hint': question_data.get('hint'),
        'feedback': question_data.get('feedback'),
    }
    question = Question.objects.create(**fields)

    type_code = question_data.get('question_type_code', '')
    payload = question_data.get('question_specific_data', {})

    # Type code -> name of the method that stores the type-specific data.
    builder_names = {
        'MC': '_create_multiple_choice_model',
        'TF': '_create_true_false_model',
        'FIB': '_create_fib_model',
        'MS': '_create_multiple_select_model',
        'MAT': '_create_matching_model',
        'ORD': '_create_ordering_model',
        'WR': '_create_written_response_model',
    }
    builder_name = builder_names.get(type_code)
    if builder_name is not None:
        getattr(self, builder_name)(question, payload)

    return question


def _create_multiple_choice_model(self, question, mc_data):
    """
    Create the MultipleChoice row and one MultipleChoiceAnswer per answer.

    Args:
        question: Question the rows are attached to.
        mc_data: Parsed dict with ``randomize``, ``enumeration`` and
            ``answers`` (dicts with ``order``, ``answer``,
            ``answer_feedback``, ``weight``).
    """
    multiple_choice = MultipleChoice.objects.create(
        question=question,
        randomize=mc_data.get('randomize', False),
        enumeration=mc_data.get('enumeration', 4)
    )

    for choice in mc_data.get('answers', []):
        MultipleChoiceAnswer.objects.create(
            multiple_choice=multiple_choice,
            order=choice.get('order', 1),
            answer=choice.get('answer', ''),
            answer_feedback=choice.get('answer_feedback'),
            weight=choice.get('weight', 0.0)
        )


def _create_true_false_model(self, question, tf_data):
    """
    Create the TrueFalse row of a question.

    Args:
        question: Question the row is attached to.
        tf_data: Parsed dict with ``true_weight``, ``true_feedback``,
            ``false_weight``, ``false_feedback`` and ``enumeration``.
    """
    fields = {
        'question': question,
        'true_weight': tf_data.get('true_weight', 0.0),
        'true_feedback': tf_data.get('true_feedback'),
        'false_weight': tf_data.get('false_weight', 0.0),
        'false_feedback': tf_data.get('false_feedback'),
        'enumeration': tf_data.get('enumeration', 4),
    }
    TrueFalse.objects.create(**fields)
def _create_fib_model(self, question, fib_data):
    """
    Create one Fib row per parsed fill-in-the-blanks part.

    Args:
        question: Question the rows are attached to.
        fib_data: Parsed dict whose ``fibs`` list holds dicts with ``type``,
            ``text``, ``order`` and optionally ``size``.
    """
    for part in fib_data.get('fibs', []):
        fields = {
            'question': question,
            'type': part.get('type', 'fibquestion'),
            'text': part.get('text', ''),
            'order': part.get('order', 1),
            'size': part.get('size'),
        }
        Fib.objects.create(**fields)


def _create_multiple_select_model(self, question, ms_data):
    """
    Create the MultipleSelect row and one MultipleSelectAnswer per answer.

    Args:
        question: Question the rows are attached to.
        ms_data: Parsed dict with ``randomize``, ``enumeration``, ``style``,
            ``grading_type`` and ``answers`` (dicts with ``order``,
            ``answer``, ``answer_feedback``, ``is_correct``).
    """
    multiple_select = MultipleSelect.objects.create(
        question=question,
        randomize=ms_data.get('randomize', False),
        enumeration=ms_data.get('enumeration', 4),
        style=ms_data.get('style', 2),
        grading_type=ms_data.get('grading_type', 2)
    )

    for option in ms_data.get('answers', []):
        MultipleSelectAnswer.objects.create(
            multiple_select=multiple_select,
            order=option.get('order', 1),
            answer=option.get('answer', ''),
            answer_feedback=option.get('answer_feedback'),
            is_correct=option.get('is_correct', False)
        )


def _create_matching_model(self, question, mat_data):
    """
    Create the Matching row, its MatchingChoice rows and their MatchingAnswer rows.

    Args:
        question: Question the rows are attached to.
        mat_data: Parsed dict with ``grading_type`` and ``choices``; each
            choice has ``choice_text`` and a ``matching_answers`` list of
            dicts with ``answer_text``.
    """
    matching = Matching.objects.create(
        question=question,
        grading_type=mat_data.get('grading_type', 0)
    )

    for choice in mat_data.get('choices', []):
        stored_choice = MatchingChoice.objects.create(
            matching=matching,
            choice_text=choice.get('choice_text', '')
        )
        for match in choice.get('matching_answers', []):
            MatchingAnswer.objects.create(
                matching_choice=stored_choice,
                answer_text=match.get('answer_text', '')
            )


def _create_ordering_model(self, question, ord_data):
    """
    Create one Ordering row per parsed ordering item.

    Args:
        question: Question the rows are attached to.
        ord_data: Parsed dict whose ``items`` list holds dicts with ``text``,
            ``order`` and ``ord_feedback``.
    """
    for entry in ord_data.get('items', []):
        fields = {
            'question': question,
            'text': entry.get('text', ''),
            'order': entry.get('order', 1),
            'ord_feedback': entry.get('ord_feedback'),
        }
        Ordering.objects.create(**fields)


def _create_written_response_model(self, question, wr_data):
    """
    Create the WrittenResponse row of a question.

    Args:
        question: Question the row is attached to.
        wr_data: Parsed dict with ``enable_student_editor``, ``initial_text``,
            ``answer_key`` and ``enable_attachments``.
    """
    fields = {
        'question': question,
        'enable_student_editor': wr_data.get('enable_student_editor', False),
        'initial_text': wr_data.get('initial_text'),
        'answer_key': wr_data.get('answer_key', ''),
        'enable_attachments': wr_data.get('enable_attachments', False),
    }
    WrittenResponse.objects.create(**fields)
WrittenResponse.objects.create( - question=question, - enable_student_editor=wr_data.get('enable_student_editor', False), - initial_text=wr_data.get('initial_text'), - answer_key=wr_data.get('answer_key', ''), - enable_attachments=wr_data.get('enable_attachments', False) - ) - - def format_to_markdown(self, question_library): - """ - Format parsed questions from Django models into markdown/text format - that matches the formatter_output structure (body text with questions). - This reconstructs the markdown that would have come from the original DOCX. - This can then be converted to DOCX using pandoc. - - Args: - question_library: QuestionLibrary Django model instance - - Returns: - str: Markdown formatted text (formatter_output format) ready for DOCX conversion - """ - lines = [] - - # Add main title as H1 heading if it exists - if question_library.main_title: - # Clean HTML from main title - main_title = question_library.main_title - try: - soup = BeautifulSoup(main_title, 'html.parser') - main_title = soup.get_text(separator=' ', strip=True) - except: - main_title = re.sub(r'\s+', ' ', main_title).strip() - lines.append(f"# {main_title}") - lines.append("") # Add blank line after title - - # Add root-level text (main_text) if present - if getattr(question_library, "main_text", None): - # Convert HTML (including embedded images) to markdown for SCORM output - main_text = self._html_to_markdown(question_library.main_text) - lines.append(main_text) - lines.append("") - - # Process sections - sections = question_library.get_sections() - for section in sections: - # For main content sections (is_main_content=True), skip #section markers and section title - # The main title is already displayed as H1 above - if not section.is_main_content: - # Add section title if present and should be displayed (## for markdown heading) - if section.title and section.is_title_displayed: - # Clean HTML from section title for display - section_title_display = section.title - try: - soup 
= BeautifulSoup(section_title_display, 'html.parser') - section_title_display = soup.get_text(separator=' ', strip=True) - except: - section_title_display = re.sub(r'\s+', ' ', section_title_display).strip() - lines.append("") - lines.append("
") - lines.append("#section") - lines.append(f"## {section_title_display}") - - # Add section text if present - # For main content sections: only display if is_text_displayed is true - # For non-main-content sections: always display if text exists (regardless of is_text_displayed) - should_display_text = False - if section.is_main_content: - should_display_text = section.text and section.is_text_displayed - else: - should_display_text = bool(section.text) # Display if text exists - - if should_display_text: - # Convert HTML with base64 images to markdown - section_text = self._html_to_markdown(section.text) - lines.append(section_text) - - # Process questions in this section - questions = section.get_questions() - for idx, question in enumerate(questions): - question_markdown = self._format_question_to_markdown(question) - lines.append(question_markdown) - - # Add /section marker after the last question for non-main-content sections - if not section.is_main_content and idx == len(questions) - 1: - # Last question - add /section right after it - lines.append("") - lines.append("
") - lines.append("/section") - - # If section has no questions, still add /section for non-main-content sections - if not section.is_main_content and len(questions) == 0: - lines.append("") - lines.append("
") - lines.append("/section") - - # Join with newlines and ensure proper formatting - result = "\n".join(lines) - if result and not result.endswith("\n"): - result += "\n" - return result - - def _format_question_to_markdown(self, question): - """ - Format a single question to markdown format matching the raw_content format - that the ANTLR questionparser expects. - Format: [number.] Type: ... Title: ... Points: ... [question text] [answers] [@Hint:] [@Feedback:] - """ - lines = [] - - # Question header: Type, Title, Points, Randomize (each on separate line) - # Each header on its own line - if question.questiontype: - lines.append("") - lines.append("
") - lines.append(f"Type: {question.questiontype}") - if question.title: - lines.append(f"Title: {question.title}") - if question.points: - # Normalize points: remove trailing zeros and decimal if not needed (e.g., 1.0000 -> 1, 1.5 -> 1.5) - normalized_points = str(float(question.points)).rstrip('0').rstrip('.') - lines.append(f"Points: {normalized_points}") - # Add Randomize if set on MC/MS question types (mirrors docx -> json randomize parsing) - randomize_value = None - if question.questiontype == 'MC': - mc = question.get_multiple_choice() - if mc and mc.randomize is not None: - randomize_value = mc.randomize - elif question.questiontype == 'MS': - ms = question.get_multiple_select() - if ms and ms.randomize is not None: - randomize_value = ms.randomize - if randomize_value is True: - lines.append("Randomize: yes") - - # Add question text (HTML format from SCORM, convert to markdown preserving base64 images) - # Prefix with question number if available (e.g., "1. Question text") - # Note: For FIB questions, skip displaying question.text here since FIB formatting includes all text parts - if question.text and question.questiontype != 'FIB': - # Convert HTML with base64 images to markdown - question_text = self._html_to_markdown(question.text) - - # Extract plain text for numbering (remove markdown formatting) - plain_text = re.sub(r'!\[.*?\]\([^)]+\)', '', question_text) # Remove image markdown - plain_text = re.sub(r'<[^>]+>', '', plain_text) # Remove any remaining HTML tags - plain_text = re.sub(r'\s+', ' ', plain_text).strip() - - # Prefix with question number if available - question_number = None - if question.index is not None: - question_number = question.index - elif question.number_provided is not None: - question_number = question.number_provided - - if question_number is not None: - lines.append(f"{question_number}. 
{question_text}") - else: - lines.append(question_text) - - # Format question-specific content based on type - question_type = question.questiontype - if question_type == 'MC': - answer_text = self._format_multiple_choice_markdown(question) - if answer_text: - lines.append(answer_text) - elif question_type == 'TF': - answer_text = self._format_true_false_markdown(question) - if answer_text: - lines.append(answer_text) - elif question_type == 'FIB': - answer_text = self._format_fib_markdown(question) - if answer_text: - # For FIB questions, prefix with question number since we skipped question.text above - question_number = None - if question.index is not None: - question_number = question.index - elif question.number_provided is not None: - question_number = question.number_provided - - if question_number is not None: - lines.append(f"{question_number}. {answer_text}") - else: - lines.append(answer_text) - elif question_type == 'MS': - answer_text = self._format_multi_select_markdown(question) - if answer_text: - lines.append(answer_text) - elif question_type == 'MAT': - answer_text = self._format_matching_markdown(question) - if answer_text: - lines.append(answer_text) - elif question_type == 'ORD': - answer_text = self._format_ordering_markdown(question) - if answer_text: - lines.append(answer_text) - elif question_type == 'WR': - answer_text = self._format_written_response_markdown(question) - if answer_text: - lines.append(answer_text) - - # Add hint if present (format: @Hint: or @HINT:) - if question.hint: - hint_text = self._html_to_markdown(question.hint) - lines.append(f"@Hint: {hint_text}") - - # Add feedback if present (format: @Feedback: or @FEEDBACK:) - if question.feedback: - feedback_text = self._html_to_markdown(question.feedback) - lines.append(f"@Feedback: {feedback_text}") - - # Use double newlines so each logical line becomes a paragraph (hard breaks, not soft) - return "\n\n".join(lines) - - def _format_multiple_choice_markdown(self, question): 
- """ - Format multiple choice question answers. - Format: a. [answer text] or *a. [answer text] for correct answers - """ - lines = [] - mc = question.get_multiple_choice() - if mc: - answers = mc.get_multiple_choice_answers() - for idx, answer in enumerate(answers, start=1): - letter = chr(96 + idx) # a, b, c, etc. - # Correct answer has * before the letter (weight > 0) - marker = "*" if answer.weight and answer.weight > 0 else "" - # Convert HTML with base64 images to markdown - answer_text = self._html_to_markdown(answer.answer) - # Indent as level 2 list (4 spaces for markdown level 2) - lines.append(f" {letter}. {marker}{answer_text}") - if answer.answer_feedback: - feedback_text = self._html_to_markdown(answer.answer_feedback) - lines.append(f" @Feedback: {feedback_text}") - return "\n".join(lines) - - def _format_true_false_markdown(self, question): - """ - Format true/false question answers. - Format: a. True / b. False with * after letter for correct answer (e.g., a. *True) - """ - lines = [] - tf = question.get_true_false() - if tf: - true_marker = "*" if tf.true_weight and tf.true_weight > 0 else "" - false_marker = "*" if tf.false_weight and tf.false_weight > 0 else "" - # Indent as level 2 list (4 spaces for markdown level 2) - lines.append(f" a. {true_marker}True") - if tf.true_feedback: - feedback_text = self._html_to_markdown(tf.true_feedback) - lines.append(f" @Feedback: {feedback_text}") - lines.append(f" b. {false_marker}False") - if tf.false_feedback: - feedback_text = self._html_to_markdown(tf.false_feedback) - lines.append(f" @Feedback: {feedback_text}") - return "\n".join(lines) - - def _format_fib_markdown(self, question): - """ - Format fill in the blanks question. - Format: Question text with [answer] markers where answers go - Example: "A [rose,flower] by any other name would smell as [sweet,good]." - Note: Clean HTML tags but preserve spacing (CDATA cleaning was skipped during parsing). 
- """ - lines = [] - fibs = question.get_fibs() - current_text = "" - for fib in fibs: - if fib.type == 'fibquestion': - if fib.text: - # Convert HTML with base64 images to markdown, preserving spacing - cleaned_text = self._html_to_markdown(fib.text) - current_text += cleaned_text - elif fib.type == 'fibanswer': - # Insert answer in brackets [answer] where the blank should be - if fib.text: - current_text += f" [{fib.text}]" - else: - current_text += " [ ]" - if current_text: - lines.append(current_text) - return "\n".join(lines) - - def _format_multi_select_markdown(self, question): - """ - Format multi-select question answers. - Format: a. [answer] or *a. [answer] for correct answers - """ - lines = [] - ms = question.get_multiple_select() - if ms: - answers = ms.get_multiple_select_answers() - for idx, answer in enumerate(answers, start=1): - letter = chr(96 + idx) # a, b, c, etc. - marker = "*" if answer.is_correct else "" - # Convert HTML with base64 images to markdown - answer_text = self._html_to_markdown(answer.answer) - # Indent as level 2 list (4 spaces for markdown level 2) - lines.append(f" {letter}. {marker}{answer_text}") - if answer.answer_feedback: - feedback_text = self._html_to_markdown(answer.answer_feedback) - lines.append(f" @Feedback: {feedback_text}") - return "\n".join(lines) - - def _format_matching_markdown(self, question): - """ - Format matching question. - Format: a. choice_text = answer_text (on same line, with enumeration) - Preserves inline HTML styling (bold, italic, etc.) but removes block-level tags (p, div, etc.) - """ - lines = [] - matching = question.get_matching() - if matching: - choices = matching.get_matching_choices() - for idx, choice in enumerate(choices, start=1): - letter = chr(96 + idx) # a, b, c, etc. 
- - # Convert HTML with base64 images to markdown (preserves inline styling and images) - choice_text = self._html_to_markdown(choice.choice_text) - - # Use the related manager matching_answers (from ForeignKey in MatchingAnswer) - answers = choice.matching_answers.all() - if answers: - # Get the first matching answer (typically there's one per choice) - answer = answers[0] - answer_text = self._html_to_markdown(answer.answer_text) - # Indent as level 2 list (4 spaces for markdown level 2) - lines.append(f" {letter}. {choice_text} = {answer_text}") - else: - # No answer found, just show choice - lines.append(f" {letter}. {choice_text} =") - return "\n".join(lines) - - def _strip_block_tags(self, html_text): - """ - Remove block-level HTML tags (p, div, etc.) but preserve inline styling tags (strong, em, b, i, etc.). - This allows formatting like bold/italic to be preserved while removing tags that cause line breaks. - Returns HTML string with inline tags preserved. - """ - if not html_text: - return '' - - try: - from bs4 import BeautifulSoup - soup = BeautifulSoup(html_text, 'html.parser') - - # Unwrap block-level tags (these cause line breaks) but preserve their content and inline tags - block_tags = ['p', 'div', 'br', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'ul', 'ol'] - for tag_name in block_tags: - for tag in soup.find_all(tag_name): - # Unwrap removes the tag but keeps its content (including inline tags) - tag.unwrap() - - # Get the HTML string with inline tags preserved - result = str(soup) - # Clean up: remove leading/trailing whitespace and normalize internal whitespace - # But preserve HTML tag structure - result = re.sub(r'>\s+<', '><', result) # Remove whitespace between tags - result = re.sub(r'\s+', ' ', result) # Normalize whitespace - result = result.strip() - return result - except Exception: - # Fallback: if parsing fails, just clean whitespace but preserve HTML structure - cleaned = re.sub(r'>\s+<', '><', html_text) - cleaned = re.sub(r'\s+', ' 
', cleaned).strip() - return cleaned - - def _format_ordering_markdown(self, question): - """ - Format ordering question. - Format: lettered list (a., b., c., etc.) with HTML tags cleaned, indented as level 2 list - """ - lines = [] - orderings = question.get_orderings() - for idx, ordering in enumerate(orderings, start=1): - letter = chr(96 + idx) # a, b, c, etc. - # Convert HTML with base64 images to markdown - ordering_text = self._html_to_markdown(ordering.text) - # Indent as level 2 list (4 spaces for markdown level 2) - lines.append(f" {letter}. {ordering_text}") - if ordering.ord_feedback: - feedback_text = self._html_to_markdown(ordering.ord_feedback) - lines.append(f" @Feedback: {feedback_text}") - return "\n".join(lines) - - def _format_written_response_markdown(self, question): - """ - Format written response question. - Format: Blank line, then "Correct Answer:" indented, then indented answer text. - Use double newlines to ensure hard paragraph breaks (not soft returns) in DOCX. - """ - lines = [] - wr = question.get_written_response() - if wr and wr.answer_key: - # Add blank line first (double newline for hard paragraph break) - lines.append("") - # Convert HTML with base64 images to markdown - answer_text = self._html_to_markdown(wr.answer_key) - # Indent with regular spaces (3 for label, 7 for answer) to mimic margin - # Avoid 4+ leading spaces to prevent markdown list or code block detection - lines.append(f"Correct Answer:") - lines.append(f"{answer_text}") - # Use double newlines so each logical line becomes a paragraph (hard breaks) - return "\n\n".join(lines) - - def convert_markdown_to_docx(self, markdown_text, output_path): - """ - Convert markdown text to DOCX file using pandoc (reverse of run_pandoc_task). - This is the final step to generate DOCX from the formatted markdown. 
- - Args: - markdown_text: Markdown formatted text (from format_to_markdown) - output_path: Path where the DOCX file should be saved - - Returns: - str: Path to the created DOCX file - """ - import pypandoc - import tempfile - import os - - # Create a temporary markdown file - with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False, encoding='utf-8') as temp_md: - temp_md.write(markdown_text) - temp_md_path = temp_md.name - - try: - # Convert markdown to DOCX using pandoc (reverse of DOCX → markdown) - # Use similar settings as the forward conversion but in reverse - pypandoc.convert_file( - temp_md_path, - format='markdown_github+fancy_lists+emoji+hard_line_breaks+all_symbols_escapable+escaped_line_breaks+pipe_tables+startnum+tex_math_dollars', - to='docx+empty_paragraphs', - outputfile=output_path, - extra_args=[ - '--no-highlight', - '--preserve-tabs', - '--wrap=preserve', - '--indent=false', - '--mathml', - '--ascii' - ] - ) - finally: - # Clean up temporary markdown file - if os.path.exists(temp_md_path): - os.unlink(temp_md_path) - - return output_path diff --git a/api/scorm/XmlWriter.py b/api/scorm/XmlWriter.py deleted file mode 100644 index 5f7626e..0000000 --- a/api/scorm/XmlWriter.py +++ /dev/null @@ -1,755 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
- -import copy -from difflib import Match -import os -import random -import shutil -import datetime -import re -import time -import xml.etree.cElementTree as ET -from uuid import UUID -from .xmlcdata import CDATA -from os import makedirs, path, walk -from os.path import basename -from django.conf import settings -from xml.dom.minidom import parseString -from zipfile import * - - -class XmlWriter: - def __init__(self, question_library): - ident = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f") - questionLibraryIdent = "QLIB_" + ident - root_el = ET.Element("questestinterop") - objectbank_el = ET.SubElement(root_el, "objectbank", {"ident": questionLibraryIdent, "xmlns:d2l_2p0": "http://desire2learn.com/xsd/d2lcp_v2p0"}) - - # root_section_obj = question_library.get_root_section() - # root_section_el = self.create_section(objectbank_el, root_section_obj) - - base_ident = "SECT_" + str(datetime.datetime.now().strftime("%Y%m%d%H%M%S")) + str(int(UUID(int=0x12345678123456781234567812345678))) - base_section_el = ET.SubElement(objectbank_el, "section", {"ident": base_ident, "title": question_library.main_title}) - if question_library.shuffle is True: - self.create_section_shuffle(base_section_el) - - self.create_presentation_material(base_section_el, question_library.main_text) # include root-level text when present - - sec_proc = ET.SubElement(base_section_el, "sectionproc_extension") - sec_proc_dis_name = ET.SubElement(sec_proc, "d2l_2p0:display_section_name") - # TODO: add is_title_displayed and text to QuestionLibrary because not all exam has root section - sec_proc_dis_name.text = "yes" # section_obj.is_title_displayed if section_obj.is_title_displayed else "yes" - sec_proc_dis_line = ET.SubElement(sec_proc, "d2l_2p0:display_section_line") - sec_proc_dis_line.text = "no" - sec_proc_dis_sec = ET.SubElement(sec_proc, "d2l_2p0:type_display_section") - sec_proc_dis_sec.text = "0" # "1" if section_obj.is_text_displayed else "0" - - - section_objs = 
def create_sectionproc_extension(self, section_el, section_obj):
    """
    Append the D2L ``sectionproc_extension`` display settings to a section element.

    Three children are written: ``d2l_2p0:display_section_name``,
    ``d2l_2p0:display_section_line`` (always "no") and
    ``d2l_2p0:type_display_section`` ("1" when the section text is displayed,
    otherwise "0").

    Args:
        section_el: The ``section`` element that receives the extension node.
        section_obj: Object exposing ``is_title_displayed`` and ``is_text_displayed``.
    """
    # sectionproc_extension Node
    sec_proc = ET.SubElement(section_el, "sectionproc_extension")

    # ElementTree can only serialize string text. A non-empty string value is
    # passed through unchanged; anything else (None, False, True, ...) becomes
    # "yes", which is also what a falsy value has always produced here.
    title_displayed = section_obj.is_title_displayed
    if not (isinstance(title_displayed, str) and title_displayed):
        title_displayed = "yes"

    sec_proc_dis_name = ET.SubElement(sec_proc, "d2l_2p0:display_section_name")
    sec_proc_dis_name.text = title_displayed
    sec_proc_dis_line = ET.SubElement(sec_proc, "d2l_2p0:display_section_line")
    sec_proc_dis_line.text = "no"
    sec_proc_dis_sec = ET.SubElement(sec_proc, "d2l_2p0:type_display_section")
    sec_proc_dis_sec.text = "1" if section_obj.is_text_displayed else "0"
def create_manifest(self, manifest_entity, folder_path):
    """
    Build the ``imsmanifest.xml`` element tree for a package.

    The tree is only built in memory; nothing is written to disk here, so the
    caller is responsible for serializing the returned tree.

    Args:
        manifest_entity: Object whose ``resources`` iterable yields items with
            ``identifier``, ``resource_type``, ``material_type``, ``href``,
            ``link_target`` and ``title`` attributes.
        folder_path: Unused; kept so existing callers keep working.

    Returns:
        xml.etree.ElementTree.ElementTree: Tree rooted at the ``manifest`` element.
    """
    root = ET.Element(
        "manifest",
        {
            "xmlns:d2l_2p0": "http://desire2learn.com/xsd/d2lcp_v2p0",
            "xmlns": "http://www.imsglobal.org/xsd/imscp_v1p1",
            "identifier": "MANIFEST_1",
        },
    )
    resources_el = ET.SubElement(root, "resources")

    # One <resource> entry per resource described by the manifest entity
    for resource in manifest_entity.resources:
        ET.SubElement(
            resources_el,
            "resource",
            {
                "identifier": resource.identifier,
                "type": resource.resource_type,
                "d2l_2p0:material_type": resource.material_type,
                "href": resource.href,
                "d2l_2p0:link_target": resource.link_target,
                "title": resource.title,
            },
        )

    return ET.ElementTree(root)
-> Flow -> Response_lid - it_pre_flow_lid = ET.SubElement(it_pre_flow, "response_lid", {"ident": question_lid, "rcardinality": "Multiple"}) - - # Commented this to deactivate MC randomized answer order - it_pre_flow_lid_render_choice = ET.SubElement(it_pre_flow_lid, "render_choice", {"shuffle": ("yes" if multiple_choice.randomize else "no")}) - - # Add hint - if question.hint: - self.generate_hint(it, question.hint) - - # Reprocessing - it_res = ET.SubElement(it, "resprocessing") - - # Add General feedback - if question.feedback: - self.generate_feedback(it, question_ident, question.feedback) - - mc_answer_index = 1 - for mc_answer in multiple_choice.get_multiple_choice_answers(): - - # Presentation -> Flow -> Response_lid -> Render_choice -> Flow_label - flow = ET.SubElement(it_pre_flow_lid_render_choice, "flow_label", {"class": "Block"}) - response_label = ET.SubElement(flow, "response_label", {"ident": question_ident_answer + str(mc_answer_index)}) - flow_mat = ET.SubElement(response_label, "flow_mat") - material = ET.SubElement(flow_mat, "material") - mattext = ET.SubElement(material, "mattext", {"texttype": "text/html"}) - mattext.append(CDATA(mc_answer.answer)) - - # Reprocessing -> Respcondition - it_res_con = ET.SubElement(it_res, "respcondition", {"title": "Response Condition" + str(mc_answer_index)}) - it_res_con_var = ET.SubElement(it_res_con, "conditionvar") - it_res_con_var_equal = ET.SubElement(it_res_con_var, "varequal", {"respident": question_lid}) - it_res_con_var_equal.text = question_ident_answer + str(mc_answer_index) - it_res_set_var = ET.SubElement(it_res_con, "setvar", {"action": "Set"}) - it_res_set_var.text = str(mc_answer.weight) if mc_answer.weight else "0.0000" - it_res_dis = ET.SubElement(it_res_con, "displayfeedback", {"feedbacktype": "Response", "linkrefid": question_ident_feedback + str(mc_answer_index)}) - - # Add Answer specific feedback - if mc_answer.answer_feedback: - self.generate_feedback(it, question_ident_feedback + 
def generate_true_false(self, it, question_ident, question):
    """
    Populate an ``item`` element with a True/False question.

    Writes the item metadata, the presentation (question text plus the two
    fixed "True"/"False" choices), the optional hint, the response processing
    with one condition per choice, and the general and per-choice feedback.

    Args:
        it: The ``item`` element to populate.
        question_ident: Identifier prefix used to derive the response, answer
            and feedback identifiers of this question.
        question: Question object exposing ``text``, ``hint``, ``feedback``
            and ``get_true_false()``.
    """
    self.itemetadata(it, "True/False", question)
    self.itemproc_extension(it)

    question_lid = question_ident + "_LID"
    question_ident_answer = question_ident + "_A"
    question_ident_feedback = question_ident + "_IF"

    # Presentation Node
    it_pre = ET.SubElement(it, "presentation")
    it_pre_flow = ET.SubElement(it_pre, "flow")

    # Presentation -> Flow
    it_pre_flow_mat = ET.SubElement(it_pre_flow, "material")

    true_false = question.get_true_false()
    # Presentation -> Material
    it_pre_flow_mat_text = ET.SubElement(it_pre_flow_mat, "mattext", {"texttype": "text/html"})
    it_pre_flow_mat_text.append(CDATA(question.text))

    # Presentation -> Flow -> Response_extension
    it_pre_flow_res = ET.SubElement(it_pre_flow, "response_extension")
    it_pre_flow_res_display_style = ET.SubElement(it_pre_flow_res, "d2l_2p0:display_style")
    it_pre_flow_res_display_style.text = "2"
    it_pre_flow_res_enumeration = ET.SubElement(it_pre_flow_res, "d2l_2p0:enumeration")
    it_pre_flow_res_enumeration.text = str(true_false.enumeration) if true_false.enumeration else "4"
    it_pre_flow_res_grading_type = ET.SubElement(it_pre_flow_res, "d2l_2p0:grading_type")
    it_pre_flow_res_grading_type.text = "0"

    # Presentation -> Flow -> Response_lid
    it_pre_flow_lid = ET.SubElement(it_pre_flow, "response_lid", {"ident": question_lid, "rcardinality": "Single"})
    it_pre_flow_lid_render_choice = ET.SubElement(it_pre_flow_lid, "render_choice", {"shuffle": "no"})

    # Add hint (same position as in the other question generators: between
    # presentation and resprocessing) so True/False hints are not dropped
    if question.hint:
        self.generate_hint(it, question.hint)

    # Reprocessing
    it_res = ET.SubElement(it, "resprocessing")

    # Add General feedback
    if question.feedback:
        self.generate_feedback(it, question_ident, question.feedback)

    # Index 0 is always the "True" choice, index 1 the "False" choice
    choices = (
        ("True", true_false.true_weight, true_false.true_feedback),
        ("False", true_false.false_weight, true_false.false_feedback),
    )
    for tf_index, (label, current_weight, current_feedback) in enumerate(choices):
        suffix = str(tf_index)

        # Presentation -> Flow -> Response_lid -> Render_choice -> Flow_label
        flow = ET.SubElement(it_pre_flow_lid_render_choice, "flow_label", {"class": "Block"})
        response_label = ET.SubElement(flow, "response_label", {"ident": question_ident_answer + suffix})
        flow_mat = ET.SubElement(response_label, "flow_mat")
        material = ET.SubElement(flow_mat, "material")
        mattext = ET.SubElement(material, "mattext", {"texttype": "text/plain"})
        mattext.text = label

        # Reprocessing -> Respcondition
        it_res_con = ET.SubElement(it_res, "respcondition", {"title": "Response Condition" + suffix})
        it_res_con_var = ET.SubElement(it_res_con, "conditionvar")
        it_res_con_var_equal = ET.SubElement(it_res_con_var, "varequal", {"respident": question_lid})
        it_res_con_var_equal.text = question_ident_answer + suffix
        it_res_set_var = ET.SubElement(it_res_con, "setvar", {"action": "Set"})
        # A missing or zero weight is written as an explicit zero score
        it_res_set_var.text = str(current_weight) if current_weight else "0.0000"
        ET.SubElement(it_res_con, "displayfeedback", {"feedbacktype": "Response", "linkrefid": question_ident_feedback + suffix})

        # Add Answer specific feedback
        if current_feedback:
            self.generate_feedback(it, question_ident_feedback + suffix, current_feedback)
def generate_multi_select(self, it, question_ident, question):
    """
    Populate an ``item`` element with a Multi-Select question.

    Writes the item metadata, the presentation (question text plus one choice
    per answer), the optional hint, and the response processing: every answer
    adds to either ``D2L_Correct`` or ``D2L_Incorrect``, and a final condition
    sets ``que_score`` from ``D2L_Correct``. General and per-answer feedback
    are appended to the item when present.

    Args:
        it: The ``item`` element to populate.
        question_ident: Identifier prefix used to derive the response, answer
            and feedback identifiers of this question.
        question: Question object exposing ``text``, ``hint``, ``feedback``
            and ``get_multiple_select()``.
    """
    self.itemetadata(it, "Multi-Select", question)
    self.itemproc_extension(it)

    question_lid = question_ident + "_LID"
    question_ident_answer = question_ident + "_A"
    question_ident_feedback = question_ident + "_IF"

    # Presentation Node
    it_pre = ET.SubElement(it, "presentation")
    it_pre_flow = ET.SubElement(it_pre, "flow")

    # Presentation -> Flow
    it_pre_flow_mat = ET.SubElement(it_pre_flow, "material")

    multiple_select = question.get_multiple_select()
    # Presentation -> Material
    it_pre_flow_mat_text = ET.SubElement(it_pre_flow_mat, "mattext", {"texttype": "text/html"})
    it_pre_flow_mat_text.append(CDATA(question.text))

    # Presentation -> Flow -> Response_extension
    it_pre_flow_res = ET.SubElement(it_pre_flow, "response_extension")
    it_pre_flow_res_display_style = ET.SubElement(it_pre_flow_res, "d2l_2p0:display_style")
    it_pre_flow_res_display_style.text = "2"
    it_pre_flow_res_enumeration = ET.SubElement(it_pre_flow_res, "d2l_2p0:enumeration")
    it_pre_flow_res_enumeration.text = str(multiple_select.enumeration) if multiple_select.enumeration else "4"
    it_pre_flow_res_grading_type = ET.SubElement(it_pre_flow_res, "d2l_2p0:grading_type")
    it_pre_flow_res_grading_type.text = "2"

    # Presentation -> Flow -> Response_lid
    it_pre_flow_lid = ET.SubElement(it_pre_flow, "response_lid", {"ident": question_lid, "rcardinality": "Multiple"})
    it_pre_flow_lid_render_choice = ET.SubElement(it_pre_flow_lid, "render_choice", {"shuffle": ("yes" if multiple_select.randomize else "no")})

    # Add hint
    if question.hint:
        self.generate_hint(it, question.hint)

    # Reprocessing
    it_res = ET.SubElement(it, "resprocessing")
    it_out = ET.SubElement(it_res, "outcomes")
    ET.SubElement(it_out, "decvar", {"vartype": "Integer", "defaultval": "0", "varname": "que_score", "minvalue": "0", "maxvalue": "100"})
    ET.SubElement(it_out, "decvar", {"vartype": "Integer", "defaultval": "0", "varname": "D2L_Correct", "minvalue": "0"})
    ET.SubElement(it_out, "decvar", {"vartype": "Integer", "defaultval": "0", "varname": "D2L_Incorrect", "minvalue": "0"})

    # Add General feedback
    if question.feedback:
        self.generate_feedback(it, question_ident, question.feedback)

    for ms_index, ms_answer in enumerate(multiple_select.get_multiple_select_answers(), start=1):
        answer_ident = question_ident_answer + str(ms_index)

        # Presentation -> Flow -> Response_lid -> Render_choice -> Flow_label
        flow = ET.SubElement(it_pre_flow_lid_render_choice, "flow_label", {"class": "Block"})
        response_label = ET.SubElement(flow, "response_label", {"ident": answer_ident})
        flow_mat = ET.SubElement(response_label, "flow_mat")
        material = ET.SubElement(flow_mat, "material")
        mattext = ET.SubElement(material, "mattext", {"texttype": "text/html"})
        # Stored as plain element text (escaped on serialization), not as CDATA
        mattext.text = ms_answer.answer

        # Reprocessing -> Respcondition
        it_res_con = ET.SubElement(it_res, "respcondition", {"title": "Response Condition", "continue": "yes"})
        it_res_con_var = ET.SubElement(it_res_con, "conditionvar")
        it_res_con_var_equal = ET.SubElement(it_res_con_var, "varequal", {"respident": question_lid})
        it_res_con_var_equal.text = answer_ident

        # Selecting this answer counts towards the correct or the incorrect tally
        tally = "D2L_Correct" if ms_answer.is_correct else "D2L_Incorrect"
        ET.SubElement(it_res_con, "setvar", {"varname": tally, "action": "Add"})

        # Add Answer specific feedback
        if ms_answer.answer_feedback:
            self.generate_feedback(it, question_ident_feedback + str(ms_index), ms_answer.answer_feedback)

    # Final condition: the question score is taken from the correct tally
    it_res_con = ET.SubElement(it_res, "respcondition")
    it_res_set_var = ET.SubElement(it_res_con, "setvar", {"varname": "que_score", "action": "Set"})
    it_res_set_var.text = "D2L_Correct"
question_ident + "_A" - question_ident_feedback = question_ident + "_IF" - - # Presentation Node - it_pre = ET.SubElement(it, "presentation") - it_pre_flow = ET.SubElement(it_pre, "flow") - - # Add hint - if question.hint: - self.generate_hint(it, question.hint) - - # Resprocessing Node - it_res = ET.SubElement(it, "resprocessing") - - # Resprocessing -> Outcomes - it_res_out = ET.SubElement(it_res, "outcomes") - it_res_out_dec_correct = ET.SubElement(it_res_out, "decvar", {"vartype": "Integer", "defaultval": "0", "varname": "D2L_Correct", "minvalue": "0", "maxvalue": "100"}) - it_res_out_dec_incorrect = ET.SubElement(it_res_out, "decvar", {"vartype": "Integer", "defaultval": "0", "varname": "D2L_Incorrect", "minvalue": "0", "maxvalue": "100"}) - it_res_out_dec_score = ET.SubElement(it_res_out, "decvar", {"vartype": "Decimal", "defaultval": "0", "varname": "que_score", "minvalue": "0", "maxvalue": "100"}) - - # Presentation -> Flow - it_pre_flow_mat = ET.SubElement(it_pre_flow, "material") - - # Presentation -> Material - it_pre_flow_mat_text = ET.SubElement(it_pre_flow_mat, "mattext", {"texttype": "text/html"}) - question_text = question.text - it_pre_flow_mat_text.append(CDATA(question_text)) - - # Presentation -> Flow -> Response_extension - it_pre_flow_res = ET.SubElement(it_pre_flow, "response_extension") - it_pre_flow_res_grading_type = ET.SubElement(it_pre_flow_res, "d2l_2p0:grading_type") - it_pre_flow_res_grading_type.text = '2' #str(matching.grading_type) - - # Presentation -> Flow -> Response_grp -> Render_choice - it_pre_flow_res_grp_ren = ET.Element("render_choice", {"shuffle": "yes"}) # add to response_grp later - it_pre_flow_res_grp_ren_flow = ET.SubElement(it_pre_flow_res_grp_ren, "flow_label", {"class": "Block"}) - - it_temp = ET.Element("temp") - matching_answers = matching.get_unique_matching_answers() - - ma_index = 1 - for matching_answer_text in matching_answers: - matching_answer_index = question_ident_answer + str(ma_index) - 
it_grp_ren_flow_lab = ET.SubElement(it_pre_flow_res_grp_ren_flow, "response_label", {"ident": matching_answer_index}) - it_grp_ren_flow_lab_flow = ET.SubElement(it_grp_ren_flow_lab, "flow_mat") - it_grp_ren_flow_lab_flow_mat = ET.SubElement(it_grp_ren_flow_lab_flow, "material") - it_grp_ren_flow_lab_flow_mat_text = ET.SubElement(it_grp_ren_flow_lab_flow_mat, "mattext", {"texttype": "text/html"}) - it_grp_ren_flow_lab_flow_mat_text.append(CDATA(matching_answer_text)) - - it_respcondition = ET.SubElement(it_temp, "respcondition") - it_respcondition_conditionvar = ET.SubElement(it_respcondition, "conditionvar") - it_respcondition_varequal = ET.SubElement(it_respcondition_conditionvar, "varequal") - it_respcondition_varequal.text = matching_answer_index - it_respcondition_setvar = ET.SubElement(it_respcondition, "setvar", {"action": "Add"}) - it_respcondition_setvar.text = "1" - - ma_index += 1 - - mc_index = 1 - for matching_choice in matching.get_matching_choices(): - matching_choice_index = question_ident_choice + str(mc_index) - - # Presentation -> Flow -> Response_grp - it_pre_flow_res_grp = ET.SubElement(it_pre_flow, "response_grp", {"respident": matching_choice_index, "rcardinality": "Single"}) - - # Presentation -> Flow -> Response_grp -> Material - it_pre_flow_res_grp_mat = ET.SubElement(it_pre_flow_res_grp, "material") - it_pre_flow_res_grp_mattext = ET.SubElement(it_pre_flow_res_grp_mat, "mattext", {"texttype": "text/html"}) - it_pre_flow_res_grp_mattext.append(CDATA(matching_choice.choice_text)) - it_pre_flow_res_grp.append(it_pre_flow_res_grp_ren) - - for respcondition in it_temp: - conditionvar = respcondition.find("conditionvar") - varequal = conditionvar.find("varequal") - varequal.set("respident", matching_choice_index) - setvar = respcondition.find("setvar") - answer_mattext = it_pre_flow.find("response_grp[@respident='" + matching_choice_index + "'].//response_label[@ident='" + varequal.text + "'].//mattext") - is_correct = 
matching_choice.has_matching_answer(answer_mattext[0].text) - if is_correct is True: - setvar.set("varname", "D2L_Correct") - else: - setvar.set("varname", "D2L_Incorrect") - it_res.append(copy.deepcopy(respcondition)) - mc_index += 1 - - match matching.grading_type: - case 0: - it_respcondition = ET.SubElement(it_res, "respcondition") - it_respcondition_var = ET.SubElement(it_respcondition, "conditionvar") - it_respcondition_var_other = ET.SubElement(it_respcondition_var, "other") - it_resp_setvar = ET.SubElement(it_respcondition, "setvar", {"varname": "que_score", "action": "Set"}) - it_resp_setvar.text = "D2L_Correct" - case 1: - it_respcondition = ET.SubElement(it_res, "respcondition") - it_respcondition_var = ET.SubElement(it_respcondition, "conditionvar") - it_respcondition_var_vargte = ET.SubElement(it_respcondition_var, "vargte", {"respident": "D2L_Incorrect"}) - it_respcondition_var_vargte.text = "0" - it_resp_setvar = ET.SubElement(it_respcondition, "setvar", {"varname": "que_score", "action": "Set"}) - it_resp_setvar.text = "0" - - it_respcondition2 = copy.deepcopy(it_respcondition) - it_resp_setvar2 = it_respcondition2.find("setvar") - it_resp_setvar2.text = "1" - it_res.append(it_respcondition2) - case 2: - it_respcondition = ET.SubElement(it_res, "respcondition") - it_respcondition_var = ET.SubElement(it_respcondition, "conditionvar") - it_respcondition_var_vargte = ET.SubElement(it_respcondition_var, "vargte", {"respident": "D2L_Incorrect"}) - it_respcondition_var_vargte.text = "D2L_Correct" - it_resp_setvar = ET.SubElement(it_respcondition, "setvar", {"varname": "que_score", "action": "Set"}) - it_resp_setvar.text = "0" - - it_respcondition2 = ET.SubElement(it_res, "respcondition") - it_respcondition_var2 = ET.SubElement(it_respcondition2, "conditionvar") - it_respcondition_var_varlt = ET.SubElement(it_respcondition_var2, "varlt", {"respident": "D2L_Incorrect"}) - it_respcondition_var_vargte.text = "D2L_Correct" - it_resp_setvar2 = 
ET.SubElement(it_respcondition2, "setvar", {"varname": "que_score", "action": "Set"}) - it_resp_setvar2.text = "D2L_Correct" - it_resp_setvar3 = ET.SubElement(it_respcondition2, "setvar", {"varname": "que_score", "action": "Subtract"}) - it_resp_setvar3.text = "D2L_Incorrect" - - # Add General feedback - if question.feedback: - self.generate_feedback(it, question_ident, question.feedback) - - - def generate_ordering(self, it, question_ident, question): - self.itemetadata(it, "Ordering", question) - self.itemproc_extension(it) - - question_o = question_ident + "_O" - question_ident_feedback = question_ident + "_IF" - - # Presentation Node - it_pre = ET.SubElement(it, "presentation") - it_pre_flow = ET.SubElement(it_pre, "flow") - - # Presentation -> Flow - - # Presentation -> Flow -> Material - it_pre_flow_mat = ET.SubElement(it_pre_flow, "material") - it_pre_flow_mat_text = ET.SubElement(it_pre_flow_mat, "mattext", {"texttype": "text/html"}) - question_text = question.text - it_pre_flow_mat_text.append(CDATA(question_text)) - - # Presentation -> Flow -> Response_extension - it_pre_flow_res_ext = ET.SubElement(it_pre_flow, "response_extension") - it_pre_flow_res_ext_grading = ET.SubElement(it_pre_flow_res_ext, "d2l_2p0:grading_type") - grading_type = 2 # Equally weighted, All or nothing, Right minus wrong - it_pre_flow_res_ext_grading.append(CDATA(grading_type)) - - # Presentation -> Flow -> Response_grp - it_pre_flow_res_grp = ET.SubElement(it_pre_flow, "response_grp", {"ident": question_o, "rcardinality": "Ordered"}) - it_pre_flow_res_grp_render = ET.SubElement(it_pre_flow_res_grp, "render_choice", {"shuffle": "yes"}) - it_pre_flow_res_grp_render_flow = ET.SubElement(it_pre_flow_res_grp_render, "flow_label", {"class": "Block"}) # populated in the loop - - # Add hint - if question.hint: - self.generate_hint(it, question.hint) - - # Resprocessing - it_res = ET.SubElement(it, "resprocessing") # populated in the loop - it_out = ET.SubElement(it_res, "outcomes") - - 
it_out_correct = ET.SubElement(it_out, "decvar", {"maxvalue": "100", "minvalue": "0", "varname": "D2L_Correct", "defaultval": "0", "vartype": "Integer"}) - it_out_incorrect = ET.SubElement(it_out, "decvar", {"minvalue": "0", "varname": "D2L_Incorrect", "defaultval": "0", "vartype": "Integer"}) - it_out_que_score = ET.SubElement(it_out, "decvar", {"minvalue": "0", "varname": "que_score", "defaultval": "0", "vartype": "Integer"}) - - it_res_con_other = ET.SubElement(it_res, "respcondition") - it_res_con_other_var = ET.SubElement(it_res_con_other, "conditionvar") - it_res_con_other_var_other = ET.SubElement(it_res_con_other_var, "other") - it_res_con_other_setvar = ET.SubElement(it_res_con_other, "setvar", {"varname": "que_score", "action": "Set"}) - it_res_con_other_setvar.text = "D2L_Correct" - - # Add General feedback - if question.feedback: - self.generate_feedback(it, question_ident, question.feedback) - - ord_index = 1 - for ord in question.get_orderings(): - ident_num = question_o + str(ord_index) - # Presentation -> Flow -> Response_grp -> response_label - it_pre_flow_res_grp_render_flow_res = ET.SubElement(it_pre_flow_res_grp_render_flow, "response_label", {"ident": ident_num}) - it_pre_flow_res_grp_render_flow_res_flow = ET.SubElement(it_pre_flow_res_grp_render_flow_res, "flow_mat") - it_pre_flow_res_grp_render_flow_res_flow_mat = ET.SubElement(it_pre_flow_res_grp_render_flow_res_flow, "material") - it_pre_flow_res_grp_render_flow_res_flow_mat_text = ET.SubElement(it_pre_flow_res_grp_render_flow_res_flow_mat, "mattext", {"texttype": "text/html"}) - question_text = ord.text - it_pre_flow_res_grp_render_flow_res_flow_mat_text.append(CDATA(question_text)) - - # Resprocessing -> Respcondition - it_res_con_correct = ET.SubElement(it_res, "respcondition", {"title": "Correct Condition"}) - it_res_con_correct_var = ET.SubElement(it_res_con_correct, "conditionvar") - it_res_con_correct_var_equal = ET.SubElement(it_res_con_correct_var, "varequal", {"respident": 
ident_num}) - it_res_con_correct_var_equal.text = str(ord_index) - it_res_con_correct_setvar = ET.SubElement(it_res_con_correct, "setvar", {"varname": "D2L_Correct", "action": "Add"}) - it_res_con_correct_setvar.text = str(1) - - it_res_con_incorrect = ET.SubElement(it_res, "respcondition", {"title": "Incorrect Condition"}) - it_res_con_incorrect_var = ET.SubElement(it_res_con_incorrect, "conditionvar") - it_res_con_incorrect_var_not = ET.SubElement(it_res_con_incorrect_var, "not") - it_res_con_incorrect_var_not_equal = ET.SubElement(it_res_con_incorrect_var_not, "varequal", {"respident": ident_num}) - it_res_con_incorrect_var_not_equal.text = str(ord_index) - it_res_con_incorrect_setvar = ET.SubElement(it_res_con_incorrect, "setvar", {"varname": "D2L_Incorrect", "action": "Add"}) - it_res_con_incorrect_setvar.text = str(1) - - # Add Answer specific feedback - if ord.ord_feedback: - self.generate_feedback(it, question_ident_feedback + str(ord_index), ord.ord_feedback) - ord_index += 1 - - - def generate_written_response(self, it, question_ident, question): - self.itemetadata(it, "Long Answer", question) - self.itemproc_extension(it) - - question_ident_str = question_ident + "_STR" - question_ident_la = question_ident + "_LA" - - # Presentation Node - it_pre = ET.SubElement(it, "presentation") - it_pre_flow = ET.SubElement(it_pre, "flow") - - written_response = question.get_written_response() - - # Presentation -> Flow - # Presentation -> Flow -> Material - it_pre_flow_mat = ET.SubElement(it_pre_flow, "material") - it_pre_flow_mat_text = ET.SubElement(it_pre_flow_mat, "mattext", {"texttype": "text/html"}) - question_text = question.text - it_pre_flow_mat_text.append(CDATA(question_text)) - - # Presentation -> Flow -> Response_extension - it_pre_flow_mat_res_ext = ET.SubElement(it_pre_flow, "response_extension") - it_pre_flow_mat_res_ext_sign = ET.SubElement(it_pre_flow_mat_res_ext, "d2l_2p0:has_signed_comments") - it_pre_flow_mat_res_ext_sign.append(CDATA("no")) - 
it_pre_flow_mat_res_ext_editor = ET.SubElement(it_pre_flow_mat_res_ext, "d2l_2p0:has_htmleditor") - - # Change it to "no" to deactivate student HTML editor answer - it_pre_flow_mat_res_ext_editor.append(CDATA("no")) - - # Presentation -> Flow -> Response_str - it_pre_flow_mat_res_str = ET.SubElement(it_pre_flow, "response_str", {"rcardinality": "Multiple", "ident": question_ident_str}) - it_pre_flow_mat_res_str_render = ET.SubElement(it_pre_flow_mat_res_str, "render_fib", {"fibtype": "String", "prompt": "Box", "columns": "100", "rows": "15"}) - it_pre_flow_mat_res_str_render_label = ET.SubElement(it_pre_flow_mat_res_str_render, "response_label", {"ident": question_ident_la}) - it_pre_flow_mat_res_str_render_label_mat = ET.SubElement(it_pre_flow_mat_res_str_render_label, "material") - it_pre_flow_mat_res_str_render_label_mat_text = ET.SubElement(it_pre_flow_mat_res_str_render_label_mat, "mattext", {"texttype": "text/html"}) - - # Add hint - if question.hint: - self.generate_hint(it, question.hint) - # Add General feedback - if question.feedback: - self.generate_feedback(it, question_ident, question.feedback) - # Initial_text - it_init_text = ET.SubElement(it, "initial_text") - it_init_text_mat = ET.SubElement(it, "initial_text_material") - it_init_text_mat_flow = ET.SubElement(it_init_text_mat, "flow_mat") - it_init_text_mat_flow_mat = ET.SubElement(it_init_text_mat_flow, "material") - it_init_text_mat_flow_mat_text = ET.SubElement(it_init_text_mat_flow_mat, "mattext", {"texttype": "text/html"}) - # Answer_key - it_ans = ET.SubElement(it, "answer_key") - it_ans_mat = ET.SubElement(it_ans, "answer_key_material") - it_ans_mat_flow = ET.SubElement(it_ans_mat, "flow_mat") - it_ans_mat_flow_mat = ET.SubElement(it_ans_mat_flow, "material") - it_ans_mat_flow_mat_text = ET.SubElement(it_ans_mat_flow_mat, "mattext", {"texttype": "text/html"}) - it_ans_mat_flow_mat_text.append(CDATA(written_response.answer_key)) diff --git a/api/scorm/manifest.py b/api/scorm/manifest.py 
deleted file mode 100644 index ba20feb..0000000 --- a/api/scorm/manifest.py +++ /dev/null @@ -1,22 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at https://mozilla.org/MPL/2.0/. - -class ManifestEntity(object): - resources = [] - - def __init__(self): - del self.resources[:] - - def add_resource(self, manifest_resource_entity): - self.resources.append(manifest_resource_entity) - - -class ManifestResourceEntity(object): - def __init__(self, identifier, resource_type, material_type, href, title = '', link_target = ''): - self.identifier = identifier - self.resource_type = resource_type - self.material_type = material_type - self.href = href - self.title = title - self.link_target = link_target \ No newline at end of file diff --git a/api/serializers.py b/api/serializers.py index 7cf4271..8c69fed 100644 --- a/api/serializers.py +++ b/api/serializers.py @@ -5,7 +5,7 @@ from rest_framework import serializers from .models import Matching, MatchingAnswer, MatchingChoice, Ordering, QuestionLibrary, Section, Question, MultipleChoice, MultipleChoiceAnswer, TrueFalse, Fib, MultipleSelect, MultipleSelectAnswer, WrittenResponse from django.conf import settings -from .process.process_helper import trim_md_to_html +from .formats.docx.process_helper import trim_md_to_html def validate_docx_file(value): @@ -53,7 +53,7 @@ def count_errors(questionlibrary): questionlibrary.save() -class WordToJsonSerializer(serializers.Serializer): +class DocxToJsonSerializer(serializers.Serializer): temp_file = serializers.FileField(validators=[validate_docx_file], max_length=100, allow_empty_file=False, use_url=True) @@ -72,8 +72,6 @@ def create(self, validated_data): newconversion.create_directory() newconversion.save() - newconversion.create_pandocstring() - newconversion.save() return newconversion def update(self, instance, validated_data): @@ -83,7 +81,7 @@ def 
update(self, instance, validated_data): class ScormToJsonSerializer(serializers.Serializer): - """Serializer for SCORM ZIP file upload to convert to JSON (mirrors WordToJsonSerializer).""" + """Serializer for SCORM ZIP file upload to convert to JSON (mirrors DocxToJsonSerializer).""" scorm_file = serializers.FileField(validators=[validate_zip_file], max_length=100, allow_empty_file=False, use_url=True) def create(self, validated_data): @@ -347,7 +345,7 @@ class Meta: class QuestionLibraryPackageSerializer(serializers.ModelSerializer): sections = SectionPackageSerializer(many=True, allow_null=True) - main_text = serializers.CharField(required=False, allow_null=True) + main_text = serializers.CharField(required=False, allow_null=True, allow_blank=True) class Meta: model = QuestionLibrary diff --git a/api/tasks.py b/api/tasks.py index abe8063..0fce822 100644 --- a/api/tasks.py +++ b/api/tasks.py @@ -12,14 +12,14 @@ from .logging.logging_adapter import FilenameLoggingAdapter from .models import EndAnswer, Question, QuestionLibrary -from .process.process_helper import (add_error_message, add_warning_message, html_to_plain, markdown_to_plain, markdown_to_html, trim_md_to_html, trim_text) -from .process.questionbuilder.fib import build_endanswer_FIB, build_inline_FIB -from .process.questionbuilder.matching import (build_endanswer_MAT, build_inline_MAT) -from .process.questionbuilder.multiplechoice import (build_endanswer_MC, build_inline_MC) -from .process.questionbuilder.multipleselect import (build_endanswer_MS, build_inline_MS) -from .process.questionbuilder.ordering import (build_endanswer_ORD, build_inline_ORD) -from .process.questionbuilder.truefalse import (build_endanswer_TF, build_inline_TF) -from .process.questionbuilder.writtenresponse import (build_endanswer_WR_with_list, build_inline_WR_with_keyword, build_inline_WR_with_list) +from .formats.docx.process_helper import (add_error_message, add_warning_message, html_to_plain, markdown_to_plain, markdown_to_html, 
trim_md_to_html, trim_text) +from .questions.model_builders.fib import build_endanswer_FIB, build_inline_FIB +from .questions.model_builders.matching import (build_endanswer_MAT, build_inline_MAT) +from .questions.model_builders.multiplechoice import (build_endanswer_MC, build_inline_MC) +from .questions.model_builders.multipleselect import (build_endanswer_MS, build_inline_MS) +from .questions.model_builders.ordering import (build_endanswer_ORD, build_inline_ORD) +from .questions.model_builders.truefalse import (build_endanswer_TF, build_inline_TF) +from .questions.model_builders.writtenresponse import (build_endanswer_WR_with_list, build_inline_WR_with_keyword, build_inline_WR_with_list) logger = logging.getLogger(__name__) loggercelery = get_task_logger(__name__) diff --git a/api/urls.py b/api/urls.py index f6d4e0b..69bd385 100644 --- a/api/urls.py +++ b/api/urls.py @@ -7,8 +7,8 @@ from django.conf import settings urlpatterns = [ - path('convert', views.WordToJson.as_view(), name='WordToJson'), - path('package', views.JsonToScorm.as_view(), name='JsonToScorm'), + path('docx-to-json', views.DocxToJson.as_view(), name='DocxToJson'), + path('json-to-scorm', views.JsonToScorm.as_view(), name='JsonToScorm'), path('scorm-to-json', views.ScormToJson.as_view(), name='ScormToJson'), path('json-to-docx', views.JsonToDocx.as_view(), name='JsonToDocx'), ] diff --git a/api/views.py b/api/views.py index e4bc244..5edf778 100644 --- a/api/views.py +++ b/api/views.py @@ -2,24 +2,22 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at https://mozilla.org/MPL/2.0/. 
-import json -from rest_framework import viewsets -from .serializers import JsonToScormSerializer, QuestionLibraryPackageSerializer, WordToJsonSerializer, ScormToJsonSerializer -from rest_framework import generics +from .serializers import JsonToScormSerializer, DocxToJsonSerializer, ScormToJsonSerializer +from .pipelines.json_to_scorm import json_to_scorm, JsonToScormError +from .pipelines.scorm_to_json import scorm_to_json +from .pipelines.json_to_docx import json_to_docx, JsonToDocxError +from .pipelines.docx_to_json import docx_to_json, DocxToJsonError +from .pipelines.response_payload import build_status_payload from rest_framework.views import APIView -from rest_framework.response import Response -from django.http import FileResponse, JsonResponse -from rest_framework.permissions import IsAuthenticated, AllowAny +from django.http import JsonResponse +from rest_framework.permissions import AllowAny from rest_framework.authentication import TokenAuthentication from rest_framework.parsers import MultiPartParser from rest_framework.parsers import JSONParser -from django.core.files.base import ContentFile from django.conf import settings -from .models import QuestionLibrary - import logging logger = logging.getLogger(__name__) from .logging.contextfilter import QuestionlibraryFilenameFilter @@ -32,205 +30,81 @@ class TokenAuthenticationWithBearer(TokenAuthentication): def __init__(self): super(TokenAuthenticationWithBearer, self).__init__() -class WordToJson(APIView): +class DocxToJson(APIView): parser_classes = [MultiPartParser] permission_classes = [AllowAny] authentication_classes = [TokenAuthenticationWithBearer] - serializer_class = WordToJsonSerializer + serializer_class = DocxToJsonSerializer def post(self, request, format=None): - is_random = False if 'randomize' in request.POST: if request.POST['randomize'].lower() in ("true", "yes"): is_random = True file_obj = request.data['temp_file'] - serializer = WordToJsonSerializer(data={ + serializer = 
DocxToJsonSerializer(data={ 'temp_file': file_obj, 'randomize': is_random }) - if serializer.is_valid(): - instance = serializer.save() - - # question_library = QuestionLibrary.objects.first() - - # question_library = instance - - # ============== start the process ======== - from .process.process import process - process(instance) - - # question_library_serializer = QuestionLibraryPackageSerializer(question_library) - - - json_string = '{"main_title":"Exam Title","randomize_answer":false,"total_question_errors":"1","total_document_errors":"0","sections":[{"is_main_content":true,"title":"Section title","is_title_displayed":false,"text":null,"is_text_displayed":false,"shuffle":false,"questions":[{"title":"MC title","text":"Question text","points":3.5,"difficulty":3,"mandatory":false,"hint":"Question hint","feedback":"Question feedback","multiple_choice":[{"randomize":true,"enumeration":1,"multiple_choice_answers":[{"answer":"MC first answer text","answer_feedback":"MC first answer feedback","weight":100},{"answer":"MC second answer text","answer_feedback":"MC second answer feedback","weight":0}]}],"true_false":null,"fib":null,"multiple_select":null,"ordering":null,"matching":null,"written_response":null},{"title":"TF title","text":"Question text","points":1,"difficulty":1,"mandatory":false,"hint":"Question hint","feedback":"Question feedback","multiple_choice":null,"true_false":[{"true_weight":100,"true_feedback":"true feedback","false_weight":0,"false_feedback":"true feedback","enumeration":2}],"fib":null,"multiple_select":null,"ordering":null,"matching":null,"written_response":null},{"title":"MS title","text":"Question text","points":1,"difficulty":1,"mandatory":false,"hint":"Question hint","feedback":"Question feedback","multiple_choice":null,"true_false":null,"fib":null,"multiple_select":[{"randomize":true,"enumeration":1,"style":2,"multiple_select_answers":[{"answer":"MS first answer text","answer_feedback":"MS first answer 
feedback","is_correct":true},{"answer":"MS second answer text","answer_feedback":"MS second answer feedback","is_correct":true}]}],"ordering":null,"matching":null,"written_response":null},{"title":"WR title","text":"Question text","points":5,"difficulty":5,"mandatory":false,"hint":"Question hint","feedback":"Question feedback","multiple_choice":null,"true_false":null,"fib":null,"multiple_select":null,"ordering":null,"matching":null,"written_response":[{"enable_student_editor":false,"initial_text":null,"answer_key":"WR answer key","enable_attachments":false}]},{"title":"FIB title","text":"Question text","points":4,"difficulty":3,"mandatory":false,"hint":"Question hint","feedback":"Question feedback","multiple_choice":null,"true_false":null,"fib":[{"type":"fibquestion","text":"1+15?","order":1,"size":null,"weight":null},{"type":"fibanswer","text":"16","order":2,"size":3,"weight":100}],"multiple_select":null,"ordering":null,"matching":null,"written_response":null},{"title":"Ordering title","text":"Question text","points":6,"difficulty":2,"mandatory":false,"hint":"Question hint","feedback":"Question feedback","multiple_choice":null,"true_false":null,"fib":null,"multiple_select":null,"ordering":[{"text":"Order 1","order":1,"ord_feedback":"Ordering 1 feedback"},{"text":"Order 1","order":2,"ord_feedback":"Ordering 2 feedback"},{"text":"Order 1","order":3,"ord_feedback":"Ordering 3 feedback"}],"matching":null,"written_response":null},{"title":"Matching title","text":"Question text","points":6,"difficulty":2,"mandatory":false,"hint":"Question hint","feedback":"Question feedback","multiple_choice":null,"true_false":null,"fib":null,"multiple_select":null,"ordering":null,"matching":[{"grading_type":1,"matching_choices":[{"choice_text":"Choice 1","matching_answers":[{"answer_text":"Choice 1 answer a"},{"answer_text":"Choice 1 answer b"}]},{"choice_text":"Choice 2","matching_answers":[{"answer_text":"Choice 2 answer a"},{"answer_text":"Choice 2 answer 
b"}]}]}],"written_response":null}]}]}' - json_data = json.loads(json_string) - for item in json_data: - match item: - case "main_title": - print(json_data["main_title"]) - case "randomize_answer": - print(json_data["randomize_answer"]) - case "total_question_errors": - print(json_data["total_question_errors"]) - case "total_document_errors": - print(json_data["total_document_errors"]) - case "sections": - for section in json_data["sections"]: - print("\t", section["title"]) - print("\t", section["is_title_displayed"]) - print("\t", section["text"]) - print("\t", section["is_text_displayed"]) - print("\t", section["shuffle"]) - - for question in section["questions"]: - print("\t\t", question["title"]) - print("\t\t", question["text"]) - print("\t\t", question["points"]) - print("\t\t", question["difficulty"]) - print("\t\t", question["mandatory"]) - print("\t\t", question["hint"]) - print("\t\t", question["feedback"]) - - if question["multiple_choice"]: - print("\t\t\tmultiple_choice") - for multiple_choice in question["multiple_choice"]: - - print("\t\t\t\t", multiple_choice["randomize"]) - print("\t\t\t\t", multiple_choice["enumeration"]) - - print("\t\t\t\tmultiple_choices_answers") - for mc_answers in multiple_choice["multiple_choices_answers"]: - print("\t\t\t\t\t", mc_answers["answer"]) - print("\t\t\t\t\t", mc_answers["answer_feedback"]) - print("\t\t\t\t\t", mc_answers["weight"]) - print("") - - elif question["true_false"]: - for true_false in question["true_false"]: - print("\t\t\ttrue_false") - print("\t\t\t\t", true_false["true_weight"]) - print("\t\t\t\t", true_false["true_feedback"]) - print("\t\t\t\t", true_false["false_weight"]) - print("\t\t\t\t", true_false["false_feedback"]) - print("\t\t\t\t", true_false["enumeration"]) - - elif question["fib"] : - print("\t\t\tfib") - for fib in question["fib"]: - print("\t\t\t\t", fib["type"]) - print("\t\t\t\t", fib["text"]) - print("\t\t\t\t", fib["order"]) - print("\t\t\t\t", fib["size"]) - print("\t\t\t\t", 
fib["weight"]) - print("") - elif question["multiple_select"]: - for multiple_select in question["multiple_select"]: - print("\t\t\tmultiple_select") - print("\t\t\t\t", multiple_select["randomize"]) - print("\t\t\t\t", multiple_select["enumeration"]) - print("\t\t\t\t", multiple_select["style"]) - - print("\t\t\t\tmultiple_select_answers") - for ms_answers in multiple_select["multiple_select_answers"]: - print("\t\t\t\t\t", ms_answers["answer"]) - print("\t\t\t\t\t", ms_answers["answer_feedback"]) - print("\t\t\t\t\t", ms_answers["is_correct"]) - print("") - - elif question["written_response"]: - for written_response in question["written_response"]: - print("\t\t\twritten_response") - print("\t\t\t\t",written_response["enable_student_editor"]) - print("\t\t\t\t", written_response["initial_text"]) - print("\t\t\t\t", written_response["answer_key"]) - print("\t\t\t\t", written_response["enable_attachments"]) - - elif question["matching"]: - for matching in question["matching"]: - print("\t\t\tmatching") - print("\t\t\t\t", matching["grading_type"]) - - print("\t\t\t\tmatching_choices") - for matching_choice in matching["matching_choices"]: - print("\t\t\t\t\t", matching_choice["choice_text"]) - if matching_choice["matching_answers"]: - for matching_answer in matching_choice["matching_answers"]: - print("\t\t\t\t\t\t", matching_answer["answer_text"]) - print("") - - elif question["ordering"]: - print("\t\t\tordering") - for ordering in question["ordering"]: - print("\t\t\t\t", ordering["text"]) - print("\t\t\t\t", ordering["order"]) - print("\t\t\t\t", ordering["ord_feedback"]) - print("") - else: - print("******************************************************") - print("NO QUESTION TYPE\n\n") - print(question) - print("******************************************************") - - - - - - - instance.json_data = json_data - instance.save() - # print(instance.json_data) - instance.cleanup() + if not serializer.is_valid(): + error_payload = build_status_payload( + 
"Error", + "Validation failed", + serializer.errors, + questionlibrary=None, + process=None, + ) + return JsonResponse(error_payload, status=400) + + instance = serializer.save() + + try: + json_data, question_library = docx_to_json(instance, logger) + question_library.cleanup() return JsonResponse(json_data, status=200) - - return JsonResponse(serializer.errors, status=400) + except DocxToJsonError as exc: + error_payload = build_status_payload( + "Error", + str(exc), + "", + process=exc.process, + questionlibrary=instance, + ) + instance.cleanup() + return JsonResponse(error_payload, status=500) class JsonToScorm(APIView): parser_classes = [JSONParser] - permission_classes = [IsAuthenticated] + permission_classes = [AllowAny] authentication_classes = [TokenAuthenticationWithBearer] serializer_class = JsonToScormSerializer def post(self, request, format=None): - json_data = request.data - ql_serializer = QuestionLibraryPackageSerializer(data=json_data['data']) - if ql_serializer.is_valid(): - ql_instance = ql_serializer.save() - ql_instance.filter_main_title() - ql_instance.folder_path = settings.MEDIA_ROOT + str(ql_instance.id) - ql_instance.image_path = ql_instance.folder_path + settings.MEDIA_URL - ql_instance.create_directory() - ql_instance.save() - file_name = ql_instance.filtered_main_title - # if (ql_instance.total_question_errors + ql_instance.total_document_errors == 0): - ql_instance.create_xml_files() - ql_instance.zip_files() - file_response = FileResponse(ql_instance.zip_file) - file_response['Content-Disposition'] = 'attachment; filename="' + file_name + '"' + try: + file_response, ql_instance = json_to_scorm(json_data, logger) logger.addFilter(QuestionlibraryFilenameFilter(ql_instance)) - logger.info("[" + str(ql_instance.id) + "] " +">>>>>>>>>>Transaction Finished>>>>>>>>>>") - + logger.info(f"[{ql_instance.id}] Transaction Finished") ql_instance.cleanup() - return file_response - - return JsonResponse({"hostname": settings.APP_VERSION, 
"serializer_errors": ql_serializer.errors}, status=400) + except JsonToScormError as exc: + error_payload = build_status_payload( + "Error", + "Validation failed", + exc.errors, + questionlibrary=None, + process=None, + ) + return JsonResponse(error_payload, status=400) class ScormToJson(APIView): """ - Reverse API endpoint: Converts SCORM ZIP file to JSON (mirrors WordToJson). + Reverse API endpoint: Converts SCORM ZIP file to JSON (mirrors DocxToJson). This is step 1 of the reverse process: SCORM → JSON. - - Steps: - 1. Extract SCORM ZIP - 2. Parse XML (XmlReader) → populate Django models - 3. Serialize models to JSON using QuestionLibraryPackageSerializer - 4. Return JSON data """ parser_classes = [MultiPartParser] permission_classes = [AllowAny] @@ -243,65 +117,40 @@ def post(self, request, format=None): 'scorm_file': file_obj }) - if serializer.is_valid(): - instance = serializer.save() - - logger.addFilter(QuestionlibraryFilenameFilter(instance)) - logger.info(f"[{instance.id}] SCORM to JSON conversion started") - - try: - # Step 1: Extract SCORM ZIP and parse XML using XmlReader - from .scorm.XmlReader import XmlReader - from os import path - - # Get the SCORM ZIP file path - scorm_zip_path = instance.temp_file.path - - # Extract and parse SCORM XML - xml_reader = XmlReader(scorm_zip_path, extract_to_path=path.join(instance.folder_path, 'scorm_extract')) - - # Step 2: Populate Django models from parsed XML - question_library = xml_reader.populate_django_models(instance) - - # Step 3: Serialize models to JSON (same format as WordToJson returns) - from .serializers import QuestionLibraryPackageSerializer - ql_serializer = QuestionLibraryPackageSerializer(question_library) - json_data = ql_serializer.data - - # Add error counts (similar to WordToJson) - from .serializers import count_errors - count_errors(question_library) - json_data['total_question_errors'] = str(question_library.total_question_errors or 0) - json_data['total_document_errors'] = 
str(question_library.total_document_errors or 0) - - instance.json_data = json_data - instance.save() - - logger.addFilter(QuestionlibraryFilenameFilter(instance)) - logger.info(f"[{instance.id}] SCORM to JSON conversion completed") - - instance.cleanup() - - return JsonResponse(json_data, status=200) - - except Exception as e: - logger.error(f"SCORM to JSON conversion failed: {str(e)}") - instance.cleanup() - return JsonResponse({"error": str(e)}, status=500) - - return JsonResponse(serializer.errors, status=400) + if not serializer.is_valid(): + error_payload = build_status_payload( + "Error", + "Validation failed", + serializer.errors, + questionlibrary=None, + process=None, + ) + return JsonResponse(error_payload, status=400) + + instance = serializer.save() + logger.addFilter(QuestionlibraryFilenameFilter(instance)) + + try: + json_data, question_library = scorm_to_json(instance, logger) + instance.cleanup() + return JsonResponse(json_data, status=200) + except Exception as e: + logger.error(f"SCORM to JSON conversion failed: {str(e)}") + error_payload = build_status_payload( + "Error", + str(e), + "", + questionlibrary=instance, + process=None, + ) + instance.cleanup() + return JsonResponse(error_payload, status=500) class JsonToDocx(APIView): """ Reverse API endpoint: Converts JSON to DOCX (mirrors JsonToScorm). This is step 2 of the reverse process: JSON → DOCX. - - Steps: - 1. Deserialize JSON to Django models (using QuestionLibraryPackageSerializer) - 2. Convert models to markdown (format_to_markdown) - 3. Convert markdown to DOCX using Pandoc - 4. 
Return DOCX file """ parser_classes = [JSONParser] permission_classes = [AllowAny] @@ -310,251 +159,31 @@ class JsonToDocx(APIView): def post(self, request, format=None): json_data = request.data - - # Use the same serializer as JsonToScorm to deserialize JSON to models - ql_serializer = QuestionLibraryPackageSerializer(data=json_data.get('data', json_data)) - if ql_serializer.is_valid(): - ql_instance = ql_serializer.save() - ql_instance.filter_main_title() - ql_instance.folder_path = settings.MEDIA_ROOT + str(ql_instance.id) - ql_instance.image_path = ql_instance.folder_path + settings.MEDIA_URL - ql_instance.create_directory() - ql_instance.save() - - logger.addFilter(QuestionlibraryFilenameFilter(ql_instance)) - logger.info(f"[{ql_instance.id}] JSON to DOCX conversion started") - - try: - # Step 1: Convert Django models to markdown (matching formatter_output format) - from .scorm.XmlReader import XmlReader - from os import path - import pypandoc - import re - - # Create XmlReader instance (we only need the format_to_markdown method) - # Since we don't need to parse XML, we create a minimal instance - xml_reader = object.__new__(XmlReader) # Create instance without calling __init__ - markdown_text = xml_reader.format_to_markdown(ql_instance) - - # Extract base64 images from HTML img tags and save as files - # Pandoc doesn't support base64 data URIs when converting markdown to DOCX - # So we need to extract them to files and use file references - import base64 - import uuid - import os - import re as re_module - image_counter = 0 - base64_pattern = r']*?)src=["\'](data:image/([^;]+);base64,([^"\']+))["\']([^>]*?)>' - - def replace_base64_with_file(match): - nonlocal image_counter - before_src = match.group(1) - full_data_uri = match.group(2) - image_type = match.group(3) # png, jpeg, etc. 
- base64_data = match.group(4) - after_src = match.group(5) - - try: - # Decode base64 image - image_data = base64.b64decode(base64_data) - - # Determine file extension from MIME type - ext_map = { - 'png': 'png', - 'jpeg': 'jpg', - 'jpg': 'jpg', - 'gif': 'gif', - 'svg+xml': 'svg', - 'webp': 'webp' - } - ext = ext_map.get(image_type.lower(), 'png') - - # Save image to temporary file - image_filename = f"image_{image_counter}_{uuid.uuid4().hex[:8]}.{ext}" - image_path = path.join(ql_instance.folder_path, image_filename) - - with open(image_path, 'wb') as img_file: - img_file.write(image_data) - - image_counter += 1 - logger.info(f"Extracted base64 image to file: {image_filename} ({len(image_data)} bytes)") - - # Extract alt text if present - alt_match = re.search(r'alt=["\']([^"\']*)["\']', before_src + after_src) - alt_text = alt_match.group(1) if alt_match else 'image' - - # Use markdown image syntax with relative path (filename only) - # We'll change working directory to folder_path before Pandoc conversion - markdown_image = f'![{alt_text}]({image_filename})' - logger.debug(f"Replacing base64 img tag with markdown: {markdown_image}") - return markdown_image - except Exception as e: - logger.error(f"Error extracting base64 image: {str(e)}") - # Return original if extraction fails - return match.group(0) - - # Replace all base64 img tags with file references - markdown_text = re.sub(base64_pattern, replace_base64_with_file, markdown_text) - logger.info(f"Extracted {image_counter} base64 images to files") - - # Step 2: Convert markdown to DOCX using Pandoc (reverse of run_pandoc_task) - # Use main_title if it exists, otherwise use filtered_main_title - if ql_instance.main_title: - # Clean main_title for filename (remove invalid characters, limit length) - filename = ql_instance.main_title.strip() - filename = re.sub(r'[<>:"/\\|?*]', '', filename) # Remove invalid filename characters - filename = re.sub(r'\s+', '_', filename) # Replace spaces with underscores - 
filename = filename[:100] # Limit length - if not filename: - filename = ql_instance.filtered_main_title - else: - filename = ql_instance.filtered_main_title - - docx_filename = f"{filename}.docx" - docx_path = path.join(ql_instance.folder_path, docx_filename) - - # Convert markdown to DOCX - # Use similar settings as the forward conversion but in reverse - # Get absolute paths for lua filters before changing directory - import os as os_module - # Calculate base directory (project root) - views.py is in api/, so go up one level - current_file_dir = os_module.path.dirname(os_module.path.abspath(__file__)) # /code/api - base_dir = os_module.path.dirname(current_file_dir) # /code - mdblockquotePath = os_module.path.join(base_dir, "pandoc", "pandoc-filters", "mdblockquote.lua") - emptyparaPath = os_module.path.join(base_dir, "pandoc", "pandoc-filters", "emptypara.lua") - # Make paths absolute - mdblockquotePath = os_module.path.abspath(mdblockquotePath) - emptyparaPath = os_module.path.abspath(emptyparaPath) - logger.debug(f"Lua filter paths: mdblockquote={mdblockquotePath}, emptypara={emptyparaPath}") - - # Create temporary markdown file - temp_md_path = path.join(ql_instance.folder_path, "temp_markdown.md") - with open(temp_md_path, 'w', encoding='utf-8') as f: - f.write(markdown_text) - - # Log markdown preview and verify image file references - # Check for image file references in markdown - import re as re_module - import glob - # Check for markdown image syntax with file references - file_refs = re_module.findall(r'!\[.*?\]\((image_\d+_[^)]+)\)', markdown_text) - logger.info(f"Found {len(file_refs)} image file references in markdown") - # List image files in the folder and their sizes - image_files = glob.glob(path.join(ql_instance.folder_path, "image_*.*")) - image_info = [] - total_image_size = 0 - for img_file in image_files: - if path.exists(img_file): - img_size = path.getsize(img_file) - total_image_size += img_size - img_size_mb = img_size / (1024 * 1024) 
- image_info.append(f"{path.basename(img_file)} ({img_size_mb:.2f} MB, {img_size} bytes)") - if len(image_files) > 0: - logger.info(f"Found {len(image_files)} image files in folder:") - for info in image_info: - logger.info(f" - {info}") - logger.info(f"Total image size: {total_image_size / (1024 * 1024):.2f} MB ({total_image_size} bytes)") - logger.info(f"Markdown file created at: {temp_md_path}") - - try: - # Convert markdown directly to DOCX - # Images are now file references, so Pandoc should be able to find and embed them - original_cwd = os_module.getcwd() - try: - os_module.chdir(ql_instance.folder_path) - # Use relative path since we changed directory - temp_md_rel_path = "temp_markdown.md" - docx_output_name = os_module.path.basename(docx_path) - logger.info(f"Converting markdown with image file references to DOCX (working dir: {os_module.getcwd()})") - # Verify images exist before conversion - import glob as glob_module - existing_images = glob_module.glob("image_*.*") - logger.info(f"Images in working directory before Pandoc: {existing_images}") - # Verify markdown has image references - with open(temp_md_rel_path, 'r', encoding='utf-8') as f: - md_content = f.read() - image_refs_in_md = re_module.findall(r'!\[.*?\]\((image_\d+_[^)]+)\)', md_content) - logger.info(f"Image references found in markdown file: {image_refs_in_md}") - # Call pandoc directly via subprocess to capture warnings/errors - import subprocess - pandoc_cmd = [ - "pandoc", - temp_md_rel_path, - "-f", - "markdown_github+fancy_lists+emoji+hard_line_breaks+all_symbols_escapable+escaped_line_breaks+pipe_tables+startnum+tex_math_dollars", - "-t", - "docx+empty_paragraphs", - "-o", - docx_output_name, - "--no-highlight", - "--preserve-tabs", - "--wrap=preserve", - "--indent=false", - "--mathml", - "--ascii", - "--lua-filter=" + mdblockquotePath, - "--lua-filter=" + emptyparaPath, - ] - logger.info(f"Running pandoc command: {' '.join(pandoc_cmd)}") - result = subprocess.run( - pandoc_cmd, - 
stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - ) - if result.returncode != 0: - logger.error(f"Pandoc failed (exit {result.returncode}): {result.stderr}") - raise Exception(f"Pandoc failed: {result.stderr}") - if result.stderr: - logger.warning(f"Pandoc warnings: {result.stderr}") - logger.info(f"Pandoc markdown to DOCX conversion completed") - finally: - os_module.chdir(original_cwd) - finally: - # Clean up temporary markdown file - if path.exists(temp_md_path): - from os import remove - remove(temp_md_path) - - # Clean up temporary image files - import glob - image_files = glob.glob(path.join(ql_instance.folder_path, "image_*.png")) + \ - glob.glob(path.join(ql_instance.folder_path, "image_*.jpg")) + \ - glob.glob(path.join(ql_instance.folder_path, "image_*.jpeg")) + \ - glob.glob(path.join(ql_instance.folder_path, "image_*.gif")) + \ - glob.glob(path.join(ql_instance.folder_path, "image_*.svg")) + \ - glob.glob(path.join(ql_instance.folder_path, "image_*.webp")) - for img_file in image_files: - try: - if path.exists(img_file): - remove(img_file) - except Exception as e: - logger.warning(f"Could not remove temporary image file {img_file}: {str(e)}") - - # Step 3: Return DOCX file - from django.core.files import File - with open(docx_path, 'rb') as f: - ql_instance.temp_file.save(docx_filename, File(f), save=True) - - file_response = FileResponse(ql_instance.temp_file) - file_response['Content-Disposition'] = f'attachment; filename="{docx_filename}"' - - # Log DOCX file size - docx_size_bytes = path.getsize(docx_path) - docx_size_mb = docx_size_bytes / (1024 * 1024) - logger.addFilter(QuestionlibraryFilenameFilter(ql_instance)) - logger.info(f"[{ql_instance.id}] JSON to DOCX conversion completed - DOCX size: {docx_size_mb:.2f} MB ({docx_size_bytes} bytes)") - - ql_instance.cleanup() - - return file_response - - except Exception as e: - logger.error(f"JSON to DOCX conversion failed: {str(e)}") - ql_instance.cleanup() - return 
JsonResponse({"error": str(e)}, status=500) - - return JsonResponse({"hostname": settings.APP_VERSION, "serializer_errors": ql_serializer.errors}, status=400) + try: + file_response, ql_instance = json_to_docx(json_data, logger) + except JsonToDocxError as exc: + error_payload = build_status_payload( + "Error", + "Validation failed", + exc.errors, + questionlibrary=None, + process=None, + ) + return JsonResponse(error_payload, status=400) + except Exception as e: + logger.error(f"JSON to DOCX conversion failed: {str(e)}") + error_payload = build_status_payload( + "Error", + str(e), + "", + questionlibrary=None, + process=None, + ) + return JsonResponse(error_payload, status=500) + + ql_instance.cleanup() + + return file_response class RootPath(APIView): diff --git a/qcon/settings.py b/qcon/settings.py index 1c5ac90..6b76b61 100644 --- a/qcon/settings.py +++ b/qcon/settings.py @@ -96,7 +96,6 @@ def get_secret(name: str, default: str = None, required: bool = False, subdirect # Local Apps 'api', - 'restapi' ] diff --git a/qcon/urls.py b/qcon/urls.py index d9a0808..963d3c4 100644 --- a/qcon/urls.py +++ b/qcon/urls.py @@ -22,11 +22,9 @@ # from django.contrib.staticfiles.urls import staticfiles_urlpatterns from django.conf import settings from api import views -from restapi import views urlpatterns = [ path('', include('api.urls')), - path('api/', include('restapi.urls')), path('', views.RootPath.as_view(), name='root') ] diff --git a/restapi/__init__.py b/restapi/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/restapi/admin.py b/restapi/admin.py deleted file mode 100644 index 8c38f3f..0000000 --- a/restapi/admin.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.contrib import admin - -# Register your models here. 
diff --git a/restapi/apps.py b/restapi/apps.py deleted file mode 100644 index 4d2371e..0000000 --- a/restapi/apps.py +++ /dev/null @@ -1,19 +0,0 @@ -from django.apps import AppConfig -from django.conf import settings -import sys -import logging -logger = logging.getLogger(__name__) - -class RestapiConfig(AppConfig): - default_auto_field = 'django.db.models.BigAutoField' - name = 'restapi' - - def ready(self): - if 'runserver' in sys.argv or 'qcon.asgi:application' in sys.argv: - logger.info("APP_VERSION: " + settings.APP_VERSION) - logger.info("IMAGE_TAG: " + settings.IMAGE_TAG) - logger.info("IMAGE_NAME: " + settings.IMAGE_NAME) - if 'runserver' in sys.argv: - logger.warning("qconapi has started in Dev Mode") - else: - logger.info("qconapi has started") diff --git a/restapi/logging/ErrorTypes.py b/restapi/logging/ErrorTypes.py deleted file mode 100644 index 37eb840..0000000 --- a/restapi/logging/ErrorTypes.py +++ /dev/null @@ -1,176 +0,0 @@ - -class MarkDownConversionError(Exception): - def __init__(self, reason, message="MarkDownConversionError"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' - -class NoTypeDeterminedError(Exception): - def __init__(self, reason, message="NoTypeDeterminedError"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' - - -class InlineNoTypeError(Exception): - def __init__(self, reason, message="InlineNoTypeError"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' - -class EndAnswerNoTypeError(Exception): - def __init__(self, reason, message="EndAnswerNoTypeError"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' - -class EMFImageError(Exception): - def __init__(self, reason, message="EMFImageError"): - self.reason = reason - self.message = message - def __str__(self): - return 
f'{self.message} -> {self.reason}' - -class MATEndStructureError(Exception): - def __init__(self, reason, message="MATEndStructureError"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' - -class MATInlineStructureError(Exception): - def __init__(self, reason, message="MATInlineStructureError"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' - -class MATNoMatchError(Exception): - def __init__(self, reason, message="MATNoMatchError"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' - -class MATMissingChoiceError(Exception): - def __init__(self, reason, message="MATMissingChoiceError"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' - -class MATMissingAnswerError(Exception): - def __init__(self, reason, message="MATMissingAnswerError"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' - -class MATMissingOptionError(Exception): - def __init__(self, reason, message="MATMissingOptionError"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' - -class FIBEndStructureError(Exception): - def __init__(self, reason, message="FIBEndStructureError"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' - -class FIBInlineStructureError(Exception): - def __init__(self, reason, message="FIBInlineStructureError"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' - -class TFEndStructureError(Exception): - def __init__(self, reason, message="TFEndStructureError"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' - 
-class TFInlineStructureError(Exception): - def __init__(self, reason, message="TFInlineStructureError"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' - -class TFNoAnswerError(Exception): - def __init__(self, reason, message="TFNoAnswerError"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' - -class TFSelectedAnswerError(Exception): - def __init__(self, reason, message="TFSelectedAnswerError"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' - -class MCEndStructureError(Exception): - def __init__(self, reason, message="MCEndStructureError"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' - -class MCInlineStructureError(Exception): - def __init__(self, reason, message="MCInlineStructureError"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' - -class ORDEndStructureError(Exception): - def __init__(self, reason, message="ORDEndStructureError"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' - -class ORDInlineStructureError(Exception): - def __init__(self, reason, message="ORDInlineStructureError"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' - -class MSEndStructureError(Exception): - def __init__(self, reason, message="MSEndStructureError"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' - -class MSInlineStructureError(Exception): - def __init__(self, reason, message="MSInlineStructureError"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' - -class WREndStructureError(Exception): - 
def __init__(self, reason, message="WREndStructureError"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' - -class WRInlineStructureError(Exception): - def __init__(self, reason, message="WRInlineStructureError"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' diff --git a/restapi/logging/WarningTypes.py b/restapi/logging/WarningTypes.py deleted file mode 100644 index d7fadc1..0000000 --- a/restapi/logging/WarningTypes.py +++ /dev/null @@ -1,61 +0,0 @@ -class MCEndAnswerExistWarning(Exception): - def __init__(self, reason, message="MCEndAnswerExistWarning"): - self.reason = reason - self.message = message - - def __str__(self): - return f'{self.message} -> {self.reason}' - - -class MSEndAnswerExistWarning(Exception): - def __init__(self, reason, message="MSEndAnswerExistWarning"): - self.reason = reason - self.message = message - - def __str__(self): - return f'{self.message} -> {self.reason}' - - -class WREndAnswerExistWarning(Exception): - def __init__(self, reason, message="WREndAnswerExistWarning"): - self.reason = reason - self.message = message - - def __str__(self): - return f'{self.message} -> {self.reason}' - - -class RespondusTypeEWarning(Exception): - def __init__(self, reason, message="RespondusTypeEWarning"): - self.reason = reason - self.message = message - - def __str__(self): - return f'{self.message} -> {self.reason}' - - -class RespondusTypeMRWarning(Exception): - def __init__(self, reason, message="RespondusTypeMRWarning"): - self.reason = reason - self.message = message - - def __str__(self): - return f'{self.message} -> {self.reason}' - - -class RespondusTypeFMBWarning(Exception): - def __init__(self, reason, message="RespondusTypeFMBWarning"): - self.reason = reason - self.message = message - - def __str__(self): - return f'{self.message} -> {self.reason}' - - -class RespondusTypeMTWarning(Exception): - def 
__init__(self, reason, message="RespondusTypeMTWarning"): - self.reason = reason - self.message = message - - def __str__(self): - return f'{self.message} -> {self.reason}' diff --git a/restapi/logging/contextfilter.py b/restapi/logging/contextfilter.py deleted file mode 100644 index f5d6b5b..0000000 --- a/restapi/logging/contextfilter.py +++ /dev/null @@ -1,25 +0,0 @@ -import os -import logging - -class QuestionlibraryFilenameFilter(logging.Filter): - def __init__(self, questionlibrary=None): - self.questionlibrary = questionlibrary - def filter(self, record): - if self.questionlibrary==None: - # record.file = '--' - pass - else: - if self.questionlibrary.temp_file.name != None: - # record.file = 'docx_filename:' + os.path.basename(self.questionlibrary.temp_file.name) - # filename = 'docx_filename:' + os.path.basename(self.questionlibrary.temp_file.name) - filename = 'docx_filename:' + self.questionlibrary.temp_file.name - record.msg = filename + " >>> " + str(record.getMessage()) - elif self.questionlibrary.filtered_main_title != None: - # record.file = 'filtered_main_title:' + os.path.basename(self.questionlibrary.filtered_main_title) - titlename = 'filtered_main_title:' + os.path.basename(self.questionlibrary.filtered_main_title) - record.msg = titlename + " >>> " + str(record.getMessage()) - else: - # record.file = '--' - pass - return True - diff --git a/restapi/logging/logging_adapter.py b/restapi/logging/logging_adapter.py deleted file mode 100644 index 26566f2..0000000 --- a/restapi/logging/logging_adapter.py +++ /dev/null @@ -1,22 +0,0 @@ -import logging - -class FilenameLoggingAdapter(logging.LoggerAdapter): - """ - This example adapter expects the passed in dict-like object to have a - 'connid' key, whose value in brackets is prepended to the log message. 
- """ - def process(self, msg, kwargs): - user_ip = "" - filename = "" - question = "" - - if 'user_ip' in self.extra: - user_ip = str(self.extra['user_ip']) - - if 'filename' in self.extra: - filename = str(self.extra['filename']) - - if 'question' in self.extra: - question = "#" + str(self.extra['question']) - - return f"{user_ip}:[{filename}]:{question} {msg}", kwargs diff --git a/restapi/models.py b/restapi/models.py deleted file mode 100644 index ad61906..0000000 --- a/restapi/models.py +++ /dev/null @@ -1,818 +0,0 @@ -# from django.db import models -from .tasks import run_pandoc_task -from .process.common.extract_images import extract_images -from .process.formatter.convert_txt import convert_txt -from .process.formatter.fix_numbering import fix_numbering -# from .process.formatter.formatter import run_formatter_parser -from .process.common.restore_images import restore_images - -import xml.etree.ElementTree as ET - -import logging -logger = logging.getLogger(__name__) -import os -import subprocess -import re - -from .logging.ErrorTypes import (WRInlineStructureError, WREndStructureError, MSInlineStructureError, MSEndStructureError, ORDInlineStructureError, ORDEndStructureError, MCInlineStructureError, MCEndStructureError, TFInlineStructureError, TFEndStructureError, FIBInlineStructureError, FIBEndStructureError, MATInlineStructureError, MATEndStructureError, InlineNoTypeError, EndAnswerNoTypeError, NoTypeDeterminedError, MarkDownConversionError) -from .logging.WarningTypes import (RespondusTypeEWarning, RespondusTypeMRWarning, RespondusTypeFMBWarning, RespondusTypeMTWarning) - -import pypandoc -from enum import Enum -from django.utils.translation import gettext_lazy as _ - -class Format: - - ''' - main variables(part of final result) - ''' - filename = None - maincontent_title = None - body = None - end_answers = None - ''' - intermediary variables - ''' - pandoc_result = None - content_after_images_extracted = None - content_converted_to_txt = None - 
content_numbering_fixed = None - images_list = [] - formatter_result = None - - def __init__(self, temp_file_path, temp_file_name, filename, maincontent_title = None): - self.temp_file_path = temp_file_path - self.temp_file_name = temp_file_name - self.filename = filename - self.maincontent_title = maincontent_title - - def convert_pandoc(self): - try: - result = run_pandoc_task.apply_async(kwargs={"temp_file_path": self.temp_file_path, - "filename": self.temp_file_name }, - ignore_result=False) - self.pandoc_result = result.get() - except Exception as e: - raise Exception(str(e)) - return self - - def extract_images(self): - self.content_after_images_extracted, self.images_list = extract_images(self.pandoc_result) - return self - - def convert_txt(self): - self.content_converted_to_txt = convert_txt(self.temp_file_path, self.filename) - return self - - def fix_numbering(self): - self.content_numbering_fixed = fix_numbering(self.content_after_images_extracted, self.content_converted_to_txt) - return self - - def run_formatter(self): - try: - self.formatter_result = self.run_formatter_parser(self.content_numbering_fixed) - - - - - - - if 'maincontent_title' in self.formatter_result.keys(): - self.maincontent_title = self.formatter_result['maincontent_title'] - if 'body' in self.formatter_result.keys(): - self.body = self.formatter_result['body'] - if 'end_answers' in self.formatter_result.keys(): - self.end_answers = self.formatter_result['end_answers'] - except Exception as e: - raise Exception(str(e)) - return self - - def restore_images(self): - self.body = restore_images(self.body, self.images_list) - - - def run_formatter_parser(self, content): - root = None - - try: - os.chdir('/antlr_build/formatter') - result = subprocess.run('java -cp formatter.jar:* formatter', - shell=True, - input=content.encode("utf-8"), - capture_output=True) - os.chdir('/code') - root = ET.fromstring(result.stdout.decode("utf-8")) - except: - raise FormatterError("Internal error while 
converting file") - - logger.debug("starting formatter extraction") - - format = {} - - # # ==================================== MAINCONTENT TITLE - maincontenttitle = root.find('maincontent_title') - logger.debug("checking maincontent title") - if maincontenttitle is not None: - main_title = (maincontenttitle.text).strip() - if main_title: - # format["maincontent_title"] = (trim_text(main_title)).lstrip('# ') - format["maincontent_title"] = main_title - else: - format["maincontent_title"] = None - - # # ==================================== BODY - body = root.find('body') - logger.debug("checking formatter body") - if body is not None: - # questionlibrary.formatter_output = body.text.rstrip() + "\n" - # questionlibrary.save() - format["body"] = body.text.rstrip() + "\n" - else: - raise FormatterError("document body not found") - - # ==================================== END ANSWERS - - end_answers = root.find('end_answers') - logger.debug("checking for endanswers block") - if end_answers is not None: - logger.debug("endanswers block found") - # questionlibrary.end_answers_raw = end_answers.text - # questionlibrary.save() - format["end_answers"] = end_answers.text - else: - logger.info("No endanswers block found") - format["end_answers"] = None - - return format - - -class FormatterError(Exception): - def __init__(self, reason, message="Formatter Error"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' - - -class BaseQuestion: - def __init__(self, questioncontent=None): - self.questioncontent = questioncontent - self.basetextanswers.clear() - self.answers.clear() - self.question_header_type = None - self.question_header_title = None - self.question_header_points = None - self.questiontype_by_user = None - self.questiontype_processed = None - - - index = None - number_provided = None - questioncontent = None #raw content - - question_header_type = None - question_header_title = None - 
question_header_points = None - - questiontype_by_user = None - questiontype_processed = None - wr_answer = None - - feedback = None - hint = None - - endanswer = None - - warning_message = [] - info_message = [] - error_message = [] - - ''' - These Vars are only used for processing and not part of final result - ''' - line_elements = None - question_body_part_list = None - - answers = [] - basetextanswers = [] - - - def get_line_elements(self): - self.questioncontent = os.linesep + self.questioncontent - os.chdir('/antlr_build/questionparser') - popen = subprocess.Popen( - 'java -cp questionparser.jar:* questionparser', - shell=True, - stdout=subprocess.PIPE, - stdin=subprocess.PIPE, - stderr=subprocess.PIPE - ) - result, errors = popen.communicate(input=self.questioncontent.encode("utf-8")) - popen.stdout.close() - return_code = popen.wait() - os.chdir('/code') - try: - self.line_elements = ET.fromstring(result.decode("utf-8")) - except Exception as e: - raise Exception(str(e)) - - - return self - - def extract_question_header_elements(self): - - question_header_type = self.line_elements.find('type') - if question_header_type is not None: - self.question_header_type = self.trim_text(question_header_type.text) - - - question_header_title = self.line_elements.find('title') - if question_header_title is not None: - self.question_header_title = self.trim_text(question_header_title.text) - - question_header_points = self.line_elements.find('points') - if question_header_points is not None: - filterpoint = re.search("\d+((.|,)\d+)?", question_header_points.text) - self.question_header_points = float(filterpoint.group()) - return self - - def get_question_body_parts_list (self): - question_body = self.line_elements.find("question_body") - if question_body is None: - raise Exception("Question_body empty") - - self.question_body_part_list = question_body.findall("question_body_part") - if self.question_body_part_list is None: - raise Exception("Question_body empty") - 
return self - - def get_number_provided(self): - try: - # save question number that was provided - number_provided = self.question_body_part_list[0].find('prefix') - if number_provided is not None: - filter_question_number = re.search("\d+", number_provided.text) - self.number_provided = filter_question_number.group() - return self - # logger.debug("Finished getting question number") - except Exception as e: - raise Exception(f"failed to extract number_provided : {str(e)}") - - def separate_question_and_answers(self): - answer_list = [] - part_of_question_list = [] - try: - # logger.debug( f"#{str(question.number_provided)} Starting splitting body_part into question_content and answers block") - # only if there are multiple question_body parts then proceed to splitting - if (len(self.question_body_part_list) == 1) and (self.question_body_part_list[0].get('prefix_type') == 'NUMLIST_PREFIX'): - part_of_question_list.append(self.question_body_part_list[0]) - else: - # Filter out the last letter enumerated list so that it can be set as the answerlist - start_of_list_found = False - # Start iterating from the last item going up untill the index "a" is found and continue adding the rest of the lists as question content - for question_body_part in reversed(self.question_body_part_list): - if not start_of_list_found: - answer_list.append(question_body_part) - else: - part_of_question_list.append(question_body_part) - if question_body_part.get('prefix_type') == "LETTERLIST_PREFIX" or question_body_part.get('prefix_type') == "CORRECT_ANSWER": - check_index = ''.join(filter(str.isalpha, question_body_part.find('prefix').text.lower())) - if check_index == "a": - start_of_list_found = True - # because we started from the last item we need to reverse the list to bring in correct order - answer_list = answer_list[::-1] - part_of_question_list = part_of_question_list[::-1] - # logger.debug( f"#{str(question.number_provided)} Finished plitting body_part into question_content and 
answers block") - except Exception as e: - raise Exception(f"failed to split body_part into question_content and answers block : {e}") - - try: - # Combine feedback and answers - # Check if first item is LETTERLIST_PREFIX or CORRECT_ANSWER - if (answer_list[0].get('prefix_type') == "LETTERLIST_PREFIX" or answer_list[0].get('prefix_type') == "CORRECT_ANSWER"): - # raise Exception("First item in Answer list is not a Letterlist item") - for answer in answer_list: - if answer.get('prefix_type') == "LETTERLIST_PREFIX": - current_answer = { - "answer_prefix": answer.find('prefix').text, - "answer_content": answer.find('content').text, - "correct": False, - "feedback": None - } - self.answers.append(current_answer) - elif answer.get('prefix_type') == "CORRECT_ANSWER": - current_answer = { - "answer_prefix": answer.find('prefix').text, - "answer_content": answer.find('content').text, - "correct": True, - "feedback": None - } - self.answers.append(current_answer) - elif answer.get('prefix_type') == "NUMLIST_PREFIX": - current_answer = self.answers.pop() - current_answer.update({"content": current_answer.get("content") + answer.find('content').text}) - self.answers.append(current_answer) - elif answer.get('prefix_type') == "FEEDBACK": - current_answer = self.answers.pop() - current_answer.update({"feedback": answer.find('content').text}) - self.answers.append(current_answer) - elif answer.get('prefix_type') == "HINT": - continue - # logger.debug( f"#{str(question.number_provided)} Finished combining answer block elements items into answers") - except Exception as e: - raise Exception(f"failed to combine answer block elements items into one answers block{e}") - - - try: - # Combine question content, any lists, feedback and hint in one dict - question_from_xml = { - "question_content": "", - "feedback": "", - "hint": "" - } - for index, question_content_item in enumerate(part_of_question_list): - if question_content_item.get('prefix_type') == "FEEDBACK": - 
question_from_xml.update({"feedback": question_content_item.find('content').text}) - elif question_content_item.get('prefix_type') == "HINT": - question_from_xml.update({"hint": question_content_item.find('content').text}) - else: - question_content = question_from_xml.get("question_content") - question_content_to_append = "" - if index > 0: - question_content_to_append = question_content_item.find('prefix').text - question_content_to_append = question_content_to_append + question_content_item.find('content').text - question_from_xml.update({"question_content": question_content + question_content_to_append}) - - if question_from_xml is not None: - question_text = question_from_xml.get("question_content") - self.questioncontent = question_text - - self.wr_answer = self.line_elements.find("wr_answer") - question_feedback = question_from_xml.get("feedback") - if question_feedback is not None: - self.feedback = question_feedback - question_hint = question_from_xml.get("hint") - if question_hint is not None: - self.hint = question_hint - - except Exception as e: - raise Exception(f"failed to combine question content, any lists, feedback and hint in one dict") - - for answer in self.answers: - self.basetextanswers.append(BaseTextAnswer(answer)) - - return self - - def check_questiontype(self): - if self.endanswer == None: - self.questiontype_processed = self.__check_inline_questiontype() - else: - self.questiontype_processed = self.__check_endanswer_questiontype() - return self - - - def compare_user_type_with_processed_type(self): - match self.questiontype_by_user: - case 'WR' | 'E': - if self.questiontype_by_user == 'E': - self.__add_respondus_type_warning(type_found='E', type_recommended='WR') - if self.endanswer == None: - if not (self.questiontype_processed == 'inline_WR_keyword' or - self.questiontype_processed == 'inline_WR_list'): - self.__add_inline_type_error(type_found='WR') - else: - if not self.questiontype_processed == 'endanswer_WR': - 
self.__add_endanswer_type_error(type_found='WR') - case 'MS' | 'MR': - if self.questiontype_by_user == 'MR': - self.__add_respondus_type_warning(type_found='MR', type_recommended='MS') - if self.endanswer == None: - if not (self.questiontype_processed == 'inline_MS'): - self.__add_inline_type_error(type_found='MS') - else: - if not self.questiontype_processed == 'endanswer_MS': - self.__add_endanswer_type_error(type_found='MS') - case 'ORD': - if self.endanswer == None: - if not (self.questiontype_processed == 'inline_ORD'): - self.__add_inline_type_error(type_found='ORD') - else: - if not self.questiontype_processed == 'endanswer_ORD': - self.__add_endanswer_type_error(type_found='ORD') - case 'MC': - if self.endanswer == None: - if not (self.questiontype_processed == 'inline_MC'): - self.__add_inline_type_error(type_found='MC') - else: - if not self.questiontype_processed == 'endanswer_MC': - self.__add_endanswer_type_error(type_found='MC') - case 'TF': - if self.endanswer == None: - if not (self.questiontype_processed == 'inline_TF'): - self.__add_inline_type_error(type_found='TF') - else: - if not self.questiontype_processed == 'endanswer_TF': - self.__add_endanswer_type_error(type_found='TF') - case 'FIB' | 'FMB': - if self.questiontype_by_user == 'FMB': - self.__add_respondus_type_warning(type_found='FMB', type_recommended='FIB') - if self.endanswer == None: - if not (self.questiontype_processed == 'inline_FIB'): - self.__add_inline_type_error(type_found='FIB') - else: - if not self.questiontype_processed == 'endanswer_FIB': - self.__add_endanswer_type_error(type_found='FIB') - case 'MAT' | 'MT': - if self.questiontype_by_user == 'MT': - self.__add_respondus_type_warning(type_found='MT', type_recommended='MAT') - case _: - logger.debug("question type not given by user") - return self - - - def build_question(self): - match self.questiontype_processed: - case 'inline_MC': - build_inline_MC(question, answers, is_random, enumeration) - case 'endanswer_MC': - 
build_endanswer_MC(question, answers, endanswer, is_random, enumeration) - case 'inline_TF': - build_inline_TF(question, answers, enumeration) - case 'endanswer_TF': - build_endanswer_TF(question, answers, endanswer, enumeration) - case 'inline_MS': - build_inline_MS(question, answers, is_random, enumeration) - case 'endanswer_MS': - build_endanswer_MS(question, answers, endanswer, is_random, enumeration) - case 'inline_WR_keyword': - build_inline_WR_with_keyword(question, wr_answer) - case 'inline_WR_list': - build_inline_WR_with_list(question, answers) - case 'endanswer_WR': - build_endanswer_WR_with_list(question, endanswer, wr_answer) - case 'inline_FIB': - build_inline_FIB(question) - case 'endanswer_FIB': - build_endanswer_FIB(question, endanswer) - case 'inline_MAT': - build_inline_MAT(question, answers) - case 'endanswer_MAT': - build_endanswer_MAT(question, endanswer) - case 'inline_ORD': - build_inline_ORD(question, answers) - case 'endanswer_ORD': - build_endanswer_ORD(question, endanswer) - case 'inline_NO_TYPE': - error_message = "Cannot determined the inline question type." - add_error_message(question, error_message) - raise InlineNoTypeError(error_message) - case 'endanswer_NO_TYPE': - error_message = "Cannot determined the end answer question type." - add_error_message(question, error_message) - raise EndAnswerNoTypeError(error_message) - - - def __add_respondus_type_warning(self, type_found, type_recommended): - self.warning_message.append(f'Respondus format "Type: {type_found}" was found on the file. 
Please use "Type: {type_recommended}" instead.') - - def __add_inline_type_error(self, type_found): - self.error_message.append(f"Inline question structure doesn't conform to {type_found} type question format.") - - def __add_endanswer_type_error(self, type_found): - self.error_message.append(f"End answer question structure doesn't conform to {type_found} type question format.") - - - - def __check_inline_questiontype(self): - answers_length = len(self.answers) - marked_answers_count = 0 - unmarked_answers_count = 0 - matching_answers_count = 0 - KeywordTrueFound = False - KeywordFalseFound = False - - is_fib = re.search(r"\[(.*?)\]", self.questioncontent) - - if answers_length == 0: - if is_fib: - # ==================== FIB confirmed ==================== - logger.debug("Question Type determined: inline_FIB") - return 'inline_FIB' - - if self.wr_answer != None: - # ==================== WR confirmed ==================== - logger.debug("Question Type determined: inline_WR_keyword") - return 'inline_WR_keyword' - - for answer in self.answers: - # answer_text = markdown_to_plain(answer.find('content').text.lower()) - answer_text = self.markdown_to_plain(answer.get("answer_content").lower()) - answer_text = self.trim_text(answer_text) - is_correct = answer.get('correct') - if is_correct: - marked_answers_count += 1 - if not is_correct: - unmarked_answers_count += 1 - - if answer_text == 'true': - KeywordTrueFound = True - - if answer_text == 'false': - KeywordFalseFound = True - matching_answers = re.search(r"(.*)=(.*)", answer_text) - - if matching_answers is not None: - matching_answers_count += 1 - - if answers_length == 2 and KeywordTrueFound == True and KeywordFalseFound == True: - # ==================== TF confirmed ==================== - logger.debug("Question Type determined: inline_TF") - return 'inline_TF' - - if marked_answers_count == 1 and (self.questiontype_by_user != 'MS' and self.questiontype_by_user != 'MR'): - # ==================== MC confirmed 
==================== - logger.debug("Question Type determined: inline_MC") - return 'inline_MC' - - if marked_answers_count > 1 or (self.questiontype_by_user == 'MS' or self.questiontype_by_user == 'MR'): - # ==================== MS confirmed ==================== - logger.debug("Question Type determined: inline_MS") - return 'inline_MS' - - if matching_answers_count == answers_length and matching_answers_count > 1 : - # ==================== MAT confirmed ==================== - logger.debug("Question Type determined: inline_MAT") - return 'inline_MAT' - - if (unmarked_answers_count == 1 and answers_length == 1) or (self.questiontype_by_user == 'WR' or self.questiontype_by_user == 'E'): - # ==================== WR confirmed ==================== - logger.debug("Question Type determined: inline_WR_list") - return 'inline_WR_list' - - if answers_length > 0 and unmarked_answers_count == answers_length: - # ==================== ORD confirmed ==================== - logger.debug("Question Type determined: inline_ORD") - return 'inline_ORD' - logger.debug("Question Type determined: inline_NO_TYPE") - return 'inline_NO_TYPE' - - - - - def __check_endanswer_questiontype(self): - answers_length = len(self.answers) - endanswer_text = self.markdown_to_plain(self.endanswer.answer.lower()) - endanswer_text = self.trim_text(endanswer_text) - - if answers_length > 0: - # possible TF, MC, MS - answer_list = list(map(str.strip, endanswer_text.split(','))) - answer_key_length = len(answer_list) - KeywordTrueFound = False - KeywordFalseFound = False - - for answer in self.answers: - answer_text = self.markdown_to_plain(answer.find('content').text.lower()) - answer_text = self.trim_text(answer_text) - - for choice_answer in answer_list: - correctanswer_index = (ord(choice_answer)-97) - - if correctanswer_index <= (answers_length-1): - # answer index exist - pass - else: - return 'endanswer_NO_TYPE' - - - if answer_text == 'true': - KeywordTrueFound = True - - if answer_text == 'false': - 
KeywordFalseFound = True - - if answers_length == 2 and KeywordTrueFound == True and KeywordFalseFound == True: - # ==================== TF confirmed ==================== - return 'endanswer_TF' - - if answer_key_length == 1 and (self.questiontype_by_user != 'MS' and self.questiontype_by_user != 'MR'): - # ==================== MC confirmed ==================== - return 'endanswer_MC' - - if (self.questiontype_by_user == 'MS' or self.questiontype_by_user == 'MR') or answer_key_length > 1: - # ==================== MS confirmed ==================== - return 'endanswer_MS' - - else: - # possible FIB, MAT, ORD, WR - matching_answers_count = 0 - is_fib = re.findall(r"\[(.*?)\]", self.questioncontent) - answer_list = list(map(str.strip, endanswer_text.split(';'))) - answer_key_length = len(answer_list) - for answer in answer_list: - matching_answer = re.search(r"(.*)=(.*)", answer) - - if matching_answer is not None: - matching_answers_count += 1 - - if matching_answers_count == answer_key_length and matching_answers_count > 1 : - # ========================= MAT confirmed ======================= - return 'endanswer_MAT' - - if len(is_fib) == answer_key_length: - # ========================= FIB confirmed ======================= - return 'endanswer_FIB' - - if answer_key_length > 1: - # ========================= ORD confirmed ======================= - return 'endanswer_ORD' - - if answer_key_length == 1: - # ========================= WR confirmed ======================= - return 'endanswer_WR' - - return 'endanswer_NO_TYPE' - - - # def build_inline_MC(question, answers, is_random, enumeration): - - # logger.debug("building inline mc") - # question.questiontype = 'MC' - # question.save() - - # mc_object = MultipleChoice.objects.create(question=question) - # if is_random == True: - # mc_object.randomize = True - - # if enumeration: - # mc_object.enumeration = enumeration - # mc_object.save() - # # grab all answers - # for answer_order, answer_item in enumerate(answers): - # 
mc_answerobject = MultipleChoiceAnswer.objects.create(multiple_choice=mc_object) - # answer_index = trim_text(answer_item.get('answer_prefix')) - # mc_answerobject.index = re.sub(r'[\W_]', '', answer_index) - # mc_answerobject.order = answer_order + 1 - # mc_answerobject.answer = trim_md_to_html(answer_item.get('answer_content')) - # answer_feedback = answer_item.get('feedback') - # is_correct = answer_item.get('correct') - # if answer_feedback != None: - # mc_answerobject.answer_feedback = trim_md_to_html(answer_feedback) - - # if is_correct: - # mc_answerobject.weight = 100 - - # mc_answerobject.save() - - - @staticmethod - def markdown_to_plain(text): - plain_text = pypandoc.convert_text(text, format="markdown_github+fancy_lists+emoji", to="plain", extra_args=['--wrap=none']) - return plain_text - - - @staticmethod - def trim_text(txt): - text = txt.strip() - text = re.sub('', '', text) - text = re.sub('', '\n', text, flags=re.IGNORECASE) - text = text.strip(" \n") - return text - - -class Section: - ''' - main sectioner variables - ''' - title = None - order = None - is_main_content = None - sectionheader = None - sectioncontent = None - - def __init__(self, title=None, - order=None, - is_main_content=None, - sectionheader=None, - sectioncontent=None): - self.title = title - self.order = order - self.is_main_content = is_main_content - self.sectionheader = sectionheader - self.sectioncontent = sectioncontent - - ''' - section variables for processing - ''' - content_from_formatter = None - content_after_images_extracted = None - - -class SectionList: - content = None - sections_list = [] - def __init__(self, content=None): - self.content = content - self.sections_list.clear() - - def run_sectioner(self): - logger.info("sectioner starting") - - content = os.linesep + self.content - - try: - os.chdir('/antlr_build/sectioner') - result = subprocess.run( - 'java -cp sectioner.jar:* sectioner', - shell=True, - input=content.encode("utf-8"), - capture_output=True) - 
os.chdir('/code') - except: - raise SectionerError("error while reading sections") - - logger.debug("starting sections extraction") - - root = None - try: - root = ET.fromstring(result.stdout.decode("utf-8")) - except: - raise SectionerError("Sectioner results empty") - - # logger.info(ET.tostring(root, encoding='utf8')) - - if len(root) == 0: - raise SectionerError("No Sections found") - - try: - for section in root: - sectionobj = Section() - - sectionobj.order = int(section.attrib.get("id")) + 1 - sectiontitle = section.find('title') - if sectiontitle is not None: - sectionobj.title = sectiontitle.text - - maincontent = section.find('maincontent') - if maincontent is not None: - sectionobj.title = content - sectionobj.is_main_content = True - sectionobj.sectioncontent = maincontent.text - - sectionheader = section.find('sectionheader') - if sectionheader is not None: - sectionobj.is_main_content = False - sectionobj.sectionheader = sectionheader.text - - sectioncontent = section.find('sectioncontent') - if sectioncontent is not None: - sectionobj.is_main_content = False - sectionobj.sectioncontent = sectioncontent.text - - self.sections_list.append(sectionobj) - except: - raise SectionerError("Error extracting section contents") - - return self - - -class QuestionList: - content = None - question_list = [] - - def __init__(self, content=None): - self.content = content - self.question_list.clear() - -class SectionerError(Exception): - def __init__(self, reason, message="Sectioner Error"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' - - - -class BaseTextAnswer(): - def __init__(self, answer): - self.answer_prefix = answer['answer_prefix'] - self.MarkedWithStar = answer['correct'] - self.answer_content = answer['answer_content'] - self.feedback = answer['feedback'] - - class EnumeratorTypes(Enum): - LOWERCASELETTERS = 'LOWERCASELETTERS', _('LOWERCASELETTERS') - UPPERCASELETTERS = 
'UPPERCASELETTERS', _('UPPERCASELETTERS') - NUMBERS = 'NUMBERS', _('NUMBERS') - ROMAN_NUMERALS = 'ROMAN_NUMERALS', _('ROMAN_NUMERALS') - UPPERCASE_ROMAN_NUMERALS = 'UPPERCASE_ROMAN_NUMERALS', _('UPPERCASE_ROMAN_NUMERALS') - NO_ENUMERATION = 'NO_ENUMERATION', _('NO_ENUMERATION') - - enumerator = EnumeratorTypes.LOWERCASELETTERS - answer_prefix = None - answer_content = None - MarkedWithStar = False - - def __str__(self): - return f"[{self.answer_prefix}][marked*:{ self.MarkedWithStar }][content:{self.answer_content[0:20]}]" - diff --git a/restapi/process/common/extract_images.py b/restapi/process/common/extract_images.py deleted file mode 100644 index 09f7f1e..0000000 --- a/restapi/process/common/extract_images.py +++ /dev/null @@ -1,29 +0,0 @@ -def extract_images(content): - import re - import logging - logger = logging.getLogger(__name__) - - images_list = [] - try: - x = re.findall(r"\", content) - if len(x) == 0: - return content, images_list - for image in x: - images_list.append(image) - - for index, image in enumerate(images_list): - val = re.escape(image) - x = re.sub(val, "<<<<"+ str(index) +">>>>" , content) - content = x - return content, images_list - except Exception as e: - raise ImageExtractError(e) - - -class ImageExtractError(Exception): - def __init__(self, reason, message=""): - self.reason = reason - self.message = message - - def __str__(self): - return f'{self.message} -> {self.reason}' \ No newline at end of file diff --git a/restapi/process/common/process_helper.py b/restapi/process/common/process_helper.py deleted file mode 100644 index 08bf5f8..0000000 --- a/restapi/process/common/process_helper.py +++ /dev/null @@ -1,66 +0,0 @@ -import re -import pypandoc - -def add_info_message(question, info_message): - if question.info: - if info_message not in question.info: - question.info = question.info + "\n" + info_message - question.save() - - else: - question.info = info_message - question.save() - -def add_warning_message(question, 
warning_message): - if question.warning: - if warning_message not in question.warning: - question.warning = question.warning + "\n" + warning_message - question.save() - - else: - question.warning = warning_message - question.save() - -def add_error_message(obj, error_message): - if obj.error: - if error_message not in obj.error: - obj.error = obj.error + "\n" + error_message - obj.save() - - else: - obj.error = error_message - obj.save() - -def trim_text(txt): - text = txt.strip() - text = re.sub('', '', text) - text = re.sub('', '\n', text, flags=re.IGNORECASE) - text = text.strip(" \n") - return text - -def markdown_to_plain(text): - plain_text = pypandoc.convert_text(text, format="markdown_github+fancy_lists+emoji", to="plain", extra_args=['--wrap=none']) - return plain_text - -def html_to_plain(text): - plain_text = pypandoc.convert_text(text, format="html", to="plain", extra_args=['--wrap=none']) - return plain_text - -def markdown_to_html(text): - html_text = pypandoc.convert_text(text, format="markdown_github+fancy_lists+emoji+task_lists+hard_line_breaks+pipe_tables+all_symbols_escapable+tex_math_dollars", to="html", extra_args=['--mathml', '--ascii']) - str_text = str(html_text) - str_text = re.sub('', lambda x: '
', str_text) - str_text = re.sub('
', lambda x: '', str_text) - str_text = re.sub('', lambda x: '', str_text) - return str_text - -def trim_md_to_plain(text): - text_content = trim_text(text) - text_content = markdown_to_plain(text_content) - return text_content - -def trim_md_to_html(text): - text_content = trim_text(text) - text_content = markdown_to_html(text_content) - text_content = text_content.strip('\n') - return text_content diff --git a/restapi/process/common/restore_images.py b/restapi/process/common/restore_images.py deleted file mode 100644 index f1af146..0000000 --- a/restapi/process/common/restore_images.py +++ /dev/null @@ -1,26 +0,0 @@ -def restore_images(content, images_list): - import re - import logging - logger = logging.getLogger(__name__) - - # This is to conditionally replace every match with the image at the index of the images_list - try: - if content is None: - return None - def replTxt(match): - x = re.search(r"\d+", match.group()) - if int(x.group()) < len(images_list): - return images_list[int(x.group())] - a = re.compile(r"(\<\<\<\<\d+\>\>\>\>)") - result = a.sub(replTxt, content) - return result - except Exception as e: - raise ImageRestoreError(e) - -class ImageRestoreError(Exception): - def __init__(self, reason, message=""): - self.reason = reason - self.message = message - - def __str__(self): - return f'{self.message} -> {self.reason}' \ No newline at end of file diff --git a/restapi/process/endanswers.py b/restapi/process/endanswers.py deleted file mode 100644 index 7ceedd8..0000000 --- a/restapi/process/endanswers.py +++ /dev/null @@ -1,45 +0,0 @@ -import os -import subprocess -import xml.etree.ElementTree as ET -from ..models import EndAnswer -import re - -def get_endanswers(questionlibrary): - if questionlibrary.end_answers_raw == None: - return 0 - os.chdir('/antlr_build/endanswers') - result = subprocess.run( - 'java -cp endanswers.jar:* endanswers', - shell=True, - input=questionlibrary.end_answers_raw.encode("utf-8"), - capture_output=True) - 
os.chdir('/code') - root = None - try: - root = ET.fromstring(result.stdout.decode("utf-8")) - except: - raise EndAnswerError("Cannot read endanswers") - answers = root.findall("answer") - endanswers_found = 0 - if answers is not None: - for answer in answers: - endanswer = EndAnswer.objects.create(question_library=questionlibrary) - content = answer.find('content').text - index = answer.find('index').text - indexdigit = re.search(r'\d+', index) - endanswer.index = indexdigit.group(0) - endanswer.answer = content - endanswers_found += 1 - endanswer.save() - else: - raise EndAnswerError("No Answers in EndAnswer") - questionlibrary.save() - return endanswers_found - -class EndAnswerError(Exception): - def __init__(self, reason, message="EndAnswer Error"): - self.reason = reason - self.message = message - - def __str__(self): - return f'{self.message} -> {self.reason}' \ No newline at end of file diff --git a/restapi/process/formatter/convert_txt.py b/restapi/process/formatter/convert_txt.py deleted file mode 100644 index cfd0fbc..0000000 --- a/restapi/process/formatter/convert_txt.py +++ /dev/null @@ -1,42 +0,0 @@ -def convert_txt(original_file_path, actual_filename): - import os - import subprocess - import uuid - from pathlib import Path - import glob - import shutil - txt_file_uuid = uuid.uuid4() - txt_lines = "" - - try: - Path("/code/temp").mkdir(parents=True, exist_ok=True) - os.chdir('/code/temp') - - subprocess.run(["soffice", - "--headless", - "--convert-to", - "txt", - "--outdir", - str(txt_file_uuid), - original_file_path], - capture_output=True) - - txt_file_path = glob.glob(f"/code/temp/{str(txt_file_uuid)}/*.txt")[0] - text_file = Path(str(txt_file_path)) - if text_file.is_file(): - f = open(txt_file_path , mode='r', encoding='utf-8-sig') - lines = f.read() - txt_lines = '\n' + lines - f.close() - shutil.rmtree("/code/temp/"+str(txt_file_uuid), ignore_errors=True) - return txt_lines - except Exception as e: - raise ConvertTxtError(e) - - -class 
ConvertTxtError(Exception): - def __init__(self, reason, message="ConvertTxtError"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' \ No newline at end of file diff --git a/restapi/process/formatter/fix_numbering.py b/restapi/process/formatter/fix_numbering.py deleted file mode 100644 index a3fe7eb..0000000 --- a/restapi/process/formatter/fix_numbering.py +++ /dev/null @@ -1,92 +0,0 @@ -def fix_numbering(content_images_tagged, content_txt): - import os - import re - import html - import jaro - import logging - logger = logging.getLogger(__name__) - - try: - #remove empty lines - ref_array = os.linesep.join([s for s in content_txt.splitlines() if s]) - - # make array by splitting lines - ref_array = ref_array.splitlines() - pandoc_array = content_images_tagged.splitlines() - - ref_index = 0 - highest_score = 0 - for pandoc_index, pandoc_ref in enumerate(pandoc_array): - # check if a list item - number_pandoc = re.search(r"^ *([0-9]+)\\?[)|.]", pandoc_ref) - if number_pandoc: - # unescape html characters like ’ etc - pandoc_comp = html.unescape(pandoc_ref) - # remove all non-letter characters - pandoc_comp = re.findall(r'[a-zA-Z0-9]+', pandoc_comp) - pandoc_comp = ''.join(pandoc_comp) - for ref_index_it, ref_element in enumerate(ref_array[ref_index:len(ref_array)], start=ref_index): - # remove all non-letter/number characters - ref_comp = re.findall(r'[a-zA-Z0-9]+', ref_element) - ref_comp = ''.join(ref_comp) - - number_ref = re.search(r"^ *([0-9]+)\\?[)|.]", ref_element) - number_ref_alt = re.search(r"^ *([0-9]+)", ref_element) - - jaro_score = jaro.jaro_metric(ref_comp,pandoc_comp) - - #check if reference is a number and skip if not a number - if not number_ref: - if number_ref_alt: - if jaro_score > 0.9: - error_question = number_pandoc.group(1) - if number_ref_alt: - error_question = number_ref_alt.group(1) - raise QuestionEnumerationError(f'did not match the supported qcon numberlist pattern "." 
or ") at question: {error_question}') - continue - - ### FOR DEBUGGING specific line - # debug_line = '47' - # if number_pandoc.group(1) == debug_line: - # logger.debug(f"ref_index = {ref_index} ref_index_it = {ref_index_it}") - # logger.debug(f"ref_element = {ref_element}") - # logger.debug(f"ref: {ref_comp[0:120]}") - # logger.debug(f"pandoc: {pandoc_comp[0:120]}") - # logger.debug(f"score: {jaro_score}") - - if jaro_score > 0.9: - # matched by similarity - # if number_ref: - if number_ref.group(1) != number_pandoc.group(1): - logger.debug(f"mismatch found [ref]:[pandoc]-[{number_ref.group(1)}:{number_pandoc.group(1)}]") - subbed = re.sub(r"[0-9]+", number_ref.group(1), pandoc_array[pandoc_index]) - pandoc_array[pandoc_index] = subbed - logger.debug(f"mismatch fixed [ref]:[pandoc]-[{number_ref.group(1)}:{number_pandoc.group(1)}]->[{number_ref.group(1)}:{number_ref.group(1)}]") - ref_index = ref_index_it+1 - break - else: - # number is the same and doesn't need fixing - ref_index = ref_index_it+1 - break - else: - # no match; continue searching - if jaro_score > highest_score: - highest_score = jaro_score - # reached end of ref array without finding a match, comparison strings need to be checked or score needs to be adjusted - if ref_index_it == len(ref_array) - 1: - error_question = number_pandoc.group(1) - logger.warning(f'No reference line found with a high enough similarity score[{highest_score}] for question: {error_question}') - raise QuestionEnumerationError(f'No reference line found with a high enough similarity score[{highest_score}] for question: {error_question}') - - combined_string = '\n'.join(pandoc_array) - return '\n' + combined_string - - except Exception as e: - raise Exception(e) - -class QuestionEnumerationError(Exception): - def __init__(self, reason, message="QuestionEnumerationError"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' \ No newline at end of file diff --git 
a/restapi/process/formatter/formatter.py b/restapi/process/formatter/formatter.py deleted file mode 100644 index b50c2fb..0000000 --- a/restapi/process/formatter/formatter.py +++ /dev/null @@ -1,126 +0,0 @@ -from ast import Not -import os -import xml.etree.ElementTree as ET -import subprocess -import re - -import logging -logger = logging.getLogger(__name__) - -def run_formatter_parser(content, filename): - root = None - - try: - os.chdir('/antlr_build/formatter') - result = subprocess.run('java -cp formatter.jar:* formatter', - shell=True, - input=content.encode("utf-8"), - capture_output=True) - os.chdir('/code') - root = ET.fromstring(result.stdout.decode("utf-8")) - except: - raise FormatterError("Internal error while converting file") - - logger.debug("starting formatter extraction") - - format = {} - -# # ==================================== MAINCONTENT TITLE - maincontenttitle = root.find('maincontent_title') - logger.debug("checking maincontent title") - if maincontenttitle is not None: - main_title = (maincontenttitle.text).strip() - if main_title: - # format["maincontent_title"] = (trim_text(main_title)).lstrip('# ') - format["maincontent_title"] = main_title - else: - format["maincontent_title"] = None - -# # ==================================== BODY - body = root.find('body') - logger.debug("checking formatter body") - if body is not None: - # questionlibrary.formatter_output = body.text.rstrip() + "\n" - # questionlibrary.save() - format["body"] = body.text.rstrip() + "\n" - else: - raise FormatterError("document body not found") - -# ==================================== END ANSWERS - - end_answers = root.find('end_answers') - logger.debug("checking for endanswers block") - if end_answers is not None: - logger.debug("endanswers block found") - # questionlibrary.end_answers_raw = end_answers.text - # questionlibrary.save() - format["end_answers"] = end_answers.text - else: - logger.info("No endanswers block found") - format["end_answers"] = None - - 
return format - - - - - - -# def run_formatter_parser(content, filename): -# logger = FilenameLoggingAdapter(newlogger, {'filename': filename}) -# root = None - -# try: -# os.chdir('/antlr_build/formatter') -# result = subprocess.run('java -cp formatter.jar:* formatter', -# shell=True, -# input=content.encode("utf-8"), -# capture_output=True) -# os.chdir('/code') -# root = ET.fromstring(result.stdout.decode("utf-8")) -# except: -# raise FormatterError("Internal error while converting file") - -# logger.debug("starting formatter extraction") - -# # format = Format() -# # ==================================== SECTION INFO - -# maincontenttitle = root.find('maincontent_title') -# logger.debug("checking maincontent title") -# if maincontenttitle is not None: -# main_title = (maincontenttitle.text).strip() -# if main_title: -# format.maincontent_title = (trim_text(main_title)).lstrip('# ') -# else: -# format.maincontent_title = None -# # ==================================== BODY - -# body = root.find('body') -# if body is not None: -# # questionlibrary.formatter_output = body.text.rstrip() + "\n" -# # questionlibrary.save() -# format.body = body.text.rstrip() + "\n" -# else: -# raise FormatterError("document body not found") - -# # ==================================== END ANSWERS - -# end_answers = root.find('end_answers') -# logger.debug("checking for endanswers block") -# if end_answers is not None: -# logger.debug("endanswers block found") -# # questionlibrary.end_answers_raw = end_answers.text -# # questionlibrary.save() -# format.end_answers = end_answers.text -# else: -# logger.info("No endanswers block found") - -# return format - -class FormatterError(Exception): - def __init__(self, reason, message="Formatter Error"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' diff --git a/restapi/process/questionparser/questionparser.py b/restapi/process/questionparser/questionparser.py deleted file mode 
100644 index 29fde05..0000000 --- a/restapi/process/questionparser/questionparser.py +++ /dev/null @@ -1,141 +0,0 @@ -from ast import Not -import os -import xml.etree.ElementTree as ET -import subprocess -import re -# from ...models import Question - -import logging -logger = logging.getLogger(__name__) - -from enum import Enum -from django.utils.translation import gettext_lazy as _ - -def run_questionparser(question): - - question.get_line_elements() - question.extract_question_header_elements() - question.get_question_body_parts_list() - question.get_number_provided() - question.separate_question_and_answers() - - # question.check_questiontype() - # question.compare_user_type_with_processed_type() - - # question.build_question() - # logger.info(dir(question)) - - # q = Question() - # m = MultipleChoice() - # m.enumeration = "letters" - # m.answers = [MultipleChoiceAnswer(1,1,"answer1"), - # MultipleChoiceAnswer(2,2,"answer two")] - # q.processedquestion = m - - # b = BaseTextAnswer() - # b.answer_content = "hallo ik ben base van base van g;kwjefn;ewlrkfm;owemkl" - - # manswer = MultipleChoiceAnswer(basetextanswer=b) - # print(manswer) - # print(question.basetextanswers) - print(help(question)) - - - return question - - - -# class MultipleChoice(): -# randomize = None -# enumeration = None -# answers = [] - - - -# class BaseTextAnswer(): -# def __init__(self, answerlistitem): -# self.answer_prefix = answerlistitem['answer_prefix'] -# self.MarkedWithStar = MarkedWithStar -# self.answer_content = answer_content -# self.feedback = feedback - -# class EnumeratorTypes(Enum): -# LOWERCASELETTERS = 'LOWERCASELETTERS', _('LOWERCASELETTERS') -# UPPERCASELETTERS = 'UPPERCASELETTERS', _('UPPERCASELETTERS') -# NUMBERS = 'NUMBERS', _('NUMBERS') -# ROMAN_NUMERALS = 'ROMAN_NUMERALS', _('ROMAN_NUMERALS') -# UPPERCASE_ROMAN_NUMERALS = 'UPPERCASE_ROMAN_NUMERALS', _('UPPERCASE_ROMAN_NUMERALS') -# NO_ENUMERATION = 'NO_ENUMERATION', _('NO_ENUMERATION') - -# enumerator = 
EnumeratorTypes.LOWERCASELETTERS -# enumindex = None -# answer_content = None -# MarkedWithStar = False - -# def __str__(self): -# return f"[{self.enumindex}][marked*:{ self.MarkedWithStar }][content:{self.answer_content[0:20]}]" - -# class MultipleChoiceAnswer(BaseTextAnswer): -# def __init__(self, basetextanswer=None, index=None, order=None): -# self.index = index -# self.order=order -# # super(MultipleChoiceAnswer, self).__init__() -# if type(basetextanswer) is BaseTextAnswer: -# super().__init__(basetextanswer) -# # self.answer=answer -# index = None -# order = None -# # answer = None -# answer_feedback = None -# weight = None -# def __str__(self): -# return f"[{self.index}][marked*:{ self.MarkedWithStar }][content:]" - - -# class TrueFalse(): -# true_weight = None -# true_feedback = None -# false_weight = None -# false_feedback = None -# enumeration = None - -# class Fib(): -# type = None -# text = None -# order = None -# size = None -# weight = None - -# class MultipleSelect(): -# randomize = None -# enumeration = None -# style = None -# grading_type = None - -# class MultipleSelectAnswer(): -# index = None -# order = None -# answer = None -# answer_feedback = None -# is_correct = None - -# class Matching(): -# grading_type = None - - -# class MatchingChoice(): -# choice_text = None - -# class MatchingAnswer(): -# answer_text = None - -# class Ordering(): -# text = None -# order = None -# ord_feedback = None - -# class WrittenResponse(): -# enable_student_editor = None -# initial_text = None -# answer_key = None -# enable_attachments = None \ No newline at end of file diff --git a/restapi/process/sectioner/sectioner.py b/restapi/process/sectioner/sectioner.py deleted file mode 100644 index 265eae9..0000000 --- a/restapi/process/sectioner/sectioner.py +++ /dev/null @@ -1,82 +0,0 @@ -import os -import subprocess -import xml.etree.ElementTree as ET -# from .process_helper import markdown_to_plain, trim_text, markdown_to_html -# from api.tasks import 
markdown_to_plain, trim_text, markdown_to_html -from ...models import Section - -import logging -newlogger = logging.getLogger(__name__) -from api.logging.logging_adapter import FilenameLoggingAdapter - -# This is to split sections into separate objects -def run_sectioner(sectionlist): - logger = FilenameLoggingAdapter(newlogger, { - 'filename': "" - }) - logger.info("sectioner starting") - - content = os.linesep + sectionlist.content - - try: - os.chdir('/antlr_build/sectioner') - result = subprocess.run( - 'java -cp sectioner.jar:* sectioner', - shell=True, - input=content.encode("utf-8"), - capture_output=True) - os.chdir('/code') - except: - raise SectionerError("error while reading sections") - - logger.debug("starting sections extraction") - - root = None - try: - root = ET.fromstring(result.stdout.decode("utf-8")) - except: - raise SectionerError("Sectioner results empty") - - # logger.info(ET.tostring(root, encoding='utf8')) - - if len(root) == 0: - raise SectionerError("No Sections found") - - try: - for section in root: - sectionobj = Section() - - sectionobj.order = int(section.attrib.get("id")) + 1 - sectiontitle = section.find('title') - if sectiontitle is not None: - sectionobj.title = sectiontitle.text - - maincontent = section.find('maincontent') - if maincontent is not None: - sectionobj.title = content - sectionobj.is_main_content = True - sectionobj.sectioncontent = maincontent.text - - sectionheader = section.find('sectionheader') - if sectionheader is not None: - sectionobj.is_main_content = False - sectionobj.sectionheader = sectionheader.text - - sectioncontent = section.find('sectioncontent') - if sectioncontent is not None: - sectionobj.is_main_content = False - sectionobj.sectioncontent = sectioncontent.text - - sectionlist.sections_list.append(sectionobj) - except: - raise SectionerError("Error extracting section contents") - - return sectionlist - - -class SectionerError(Exception): - def __init__(self, reason, message="Sectioner 
Error"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> {self.reason}' \ No newline at end of file diff --git a/restapi/process/splitter/splitter.py b/restapi/process/splitter/splitter.py deleted file mode 100644 index 2aaf64f..0000000 --- a/restapi/process/splitter/splitter.py +++ /dev/null @@ -1,162 +0,0 @@ -import os -import subprocess -import xml.etree.ElementTree as ET -# from api.tasks import trim_text -import logging -logger = logging.getLogger(__name__) - -from ...models import QuestionList -from ...models import BaseQuestion -# from ...models import Question - - -import re -import os - -class Splitter(QuestionList): - def __init__(self, content) -> None: - super().__init__(content=content) - self.total_questions_found = 0 - self.current_section_starts_with_1 = False - - def add_newlines_before_question(self): - lines_altered = [] - lines_original = self.content.splitlines() - # logger.debug("raw_content") - # logger.debug(section.raw_content) - # logger.debug("lines original") - # logger.debug(lines_original) - - # check if the first question was found already - number_1_found = False - for line in lines_original: - number_prefix = re.search(r"^ *(\d+)[\\]{0,2}[.|)]", line) - if number_prefix: - numbered_line = int(number_prefix.group(1)) - if numbered_line != 1: - #this section doesn't start with 1 so we dont need to check for it further - number_1_found = True - self.current_section_starts_with_1 = False - break - else: - number_1_found = False - self.current_section_starts_with_1 = True - break - tracklist = 0 - newline_detected = False - # letterlist_enumvalue = '' - for line in lines_original: - # check if newlines are detected.(newlines cancel lists) - if '' in line: - #means newline is in this line so it canceled the previous list tracking - # reset list back to zero - newline_detected = True - tracklist = 0 - if number_1_found: - #check if the current line is a numbered line - number_prefix = 
re.search(r"^ *(\d+)[\\]{0,2}[.|)]", line) - if number_prefix: - numbered_line = int(number_prefix.group(1)) - #it is a numbered line, so check if it is a #1 - if numbered_line == 1: - # starting a new numbered list - tracklist = 1 - newline_detected = False # reset to allow new list to be tracked - else: - # check if we were in a list on the previous numbered line - if tracklist == 0: - # we were not a list on the previous numbered line - lines_altered.append('\n') - else: - # we were in a list on the previous line - # check if we still are on a list on this line - if numbered_line == tracklist+1: - # this means we might still be inside a list. - # to make sure lets see if a newline was detected prior to this line - if newline_detected: - # there was a newline detected so this means the list is cancelled - # reset the list tracker to zero - tracklist = 0 - # and because the list was cancelled we can assume this line to be a new question - lines_altered.append('\n') - # reset the newline_detected to False - newline_detected = False - else: - #update tracklist to track the current list further - tracklist = numbered_line - # TODO WARN USER ABOUT POTENTIAL NEWLINE NEEDED HERE?? But we don't know the criteria to detect this issue yet. 
more development needed here - else: - # this means we have exited the list, and is safe to assume this is a new question - lines_altered.append('\n') - tracklist = 0 - else: - # look for first question - if re.search(r"^ *1[\\]{0,2}[.|)]", line): - number_1_found = True - lines_altered.append(line) - result = os.linesep.join(lines_altered) - result = os.linesep + result - self.content = result - return self - - - def split_questions(self): - root = None - try: - os.chdir('/antlr_build/splitter') - result = subprocess.run( - 'java -cp splitter.jar:* splitter', - shell=True, - input=self.content.encode("utf-8"), - capture_output=True) - os.chdir('/code') - root = ET.fromstring(result.stdout.decode("utf-8")) - except Exception as e: - raise SplitterError("ANTLR: " + str(e)) - - # COPY contents of first element into the second element because this sections does not start with number 1. - # meaning that the contents of the first element belongs - # to the first question in this section - if not self.current_section_starts_with_1: - if len(root) > 1: - root[1][0].text = str(root[0][0].text) + str(root[1][0].text) - root.remove(root[0]) - #renumber the question id because the first element was removed after being copied to the second element - id = 0 - for question in root: - question.attrib["id"] = str(id) - id += 1 - - try: - for index, question in enumerate(root, start=1): - questionobj = BaseQuestion() - questionobj.index = index - questioncontent = question.find('questioncontent') - if questioncontent is not None: - questionobj.questioncontent = questioncontent.text - self.question_list.append(questionobj) - except: - # sectionobject.error = "Failed to process questions in section" - raise SplitterError("Failed to process questions in section") - # return self.questionlist - - - -class SplitterError(Exception): - def __init__(self, reason, message="Splitter Error"): - self.reason = reason - self.message = message - def __str__(self): - return f'{self.message} -> 
{self.reason}' - - - - - - - - - - - - diff --git a/restapi/serializers.py b/restapi/serializers.py deleted file mode 100644 index 0eadcf6..0000000 --- a/restapi/serializers.py +++ /dev/null @@ -1,58 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this -# file, You can obtain one at https://mozilla.org/MPL/2.0/. - -from rest_framework import serializers -from django.conf import settings - -class FormatSerializer(serializers.Serializer): - filename = serializers.CharField(required=False) - maincontent_title = serializers.CharField(required=False) - body = serializers.CharField() - end_answers = serializers.CharField(required=False) - - -class SectionSerializer(serializers.Serializer): - title = serializers.CharField(required=False) - order = serializers.IntegerField(max_value=None, min_value=None, required=False) - is_main_content = serializers.BooleanField(required=False) - sectionheader = serializers.CharField(allow_null=True, required=False) - sectioncontent = serializers.CharField() - -class SectionListSerializer(serializers.Serializer): - sections_list = SectionSerializer(many=True, required=False, allow_null=True) - -class QuestionBaseSerializer(serializers.Serializer): - index = serializers.IntegerField(max_value=None, min_value=None, required=False) - questioncontent = serializers.CharField(required=False) -class QuestionListSerializer(serializers.Serializer): - question_list = QuestionBaseSerializer(many=True) - -class ProcessedQuestionSerializer(serializers.Serializer): - randomize = serializers.CharField(required=False) - enumeration = serializers.CharField(required=False) -# answers = serializers.ListField( -# child=serializers.IntegerField(min_value=0, max_value=100) -# ) - -class BasetextAnswerSerializer(serializers.Serializer): - enumindex = serializers.CharField(required=False) - answer_content = serializers.CharField(required=False) - -class 
BaseTextAnswerField(serializers.Field): - def to_representation(self, value): - return f"{value.answer_prefix}" \ - , f"{value.MarkedWithStar}" \ - , f"{value.answer_content}" - -class QuestionSerializer(serializers.Serializer): - number_provided = serializers.CharField(required=False) - question_header_type = serializers.CharField(required=False) - question_header_title = serializers.CharField(required=False) - question_header_points = serializers.CharField(required=False) - questiontype_processed = serializers.CharField(required=False) - questioncontent = serializers.CharField(required=False) - basetextanswers = serializers.ListField(required=False, - child=BaseTextAnswerField(required=False)) - # answers = serializers.CharField(required=False) - # processedquestion = ProcessedQuestionSerializer(required=False) diff --git a/restapi/tasks.py b/restapi/tasks.py deleted file mode 100644 index 09a6545..0000000 --- a/restapi/tasks.py +++ /dev/null @@ -1,64 +0,0 @@ -from celery import shared_task -from celery.utils.log import get_task_logger - -loggercelery = get_task_logger(__name__) -import re - -from .logging.logging_adapter import FilenameLoggingAdapter -from .logging.ErrorTypes import (WRInlineStructureError, WREndStructureError, MSInlineStructureError, MSEndStructureError, ORDInlineStructureError, ORDEndStructureError, MCInlineStructureError, MCEndStructureError, TFInlineStructureError, TFEndStructureError, FIBInlineStructureError, FIBEndStructureError, MATInlineStructureError, MATEndStructureError, InlineNoTypeError, EndAnswerNoTypeError, NoTypeDeterminedError, MarkDownConversionError) -from .logging.WarningTypes import (RespondusTypeEWarning, RespondusTypeMRWarning, RespondusTypeFMBWarning, RespondusTypeMTWarning) - -@shared_task() -def run_pandoc_task(temp_file_path, filename): - logger = FilenameLoggingAdapter(loggercelery, { - 'filename': filename - }) - - try: - import pypandoc - mdblockquotePath = "./pandoc/pandoc-filters/mdblockquote.lua" - emptyparaPath 
= "./pandoc/pandoc-filters/emptypara.lua" - imageFilterPath = "./pandoc/pandoc-filters/image.lua" - tables = "./pandoc/pandoc-filters/tables.lua" - linebreakPath = "./pandoc/pandoc-filters/linebreak.lua" - # listsPath = "./api/pandoc/pandoc-filters/lists.lua" - - pandoc_word_to_html = pypandoc.convert_file( - temp_file_path, - format='docx+empty_paragraphs', - to='html+empty_paragraphs+tex_math_single_backslash', - extra_args=['--no-highlight', - '--embed-resources', - '--markdown-headings=atx', - '--preserve-tabs', - '--wrap=preserve', - '--indent=false', - '--mathml', - '--ascii', - # '--lua-filter=' + imageFilterPath - ]) - pandoc_word_to_html = re.sub(r"(?!\s)", " ", pandoc_word_to_html) - pandoc_word_to_html = re.sub(r"(?!\s)", " ", pandoc_word_to_html) - pandoc_html_to_md = pypandoc.convert_text( - pandoc_word_to_html, - 'markdown_github+fancy_lists+emoji+hard_line_breaks+all_symbols_escapable+escaped_line_breaks+pipe_tables+startnum+tex_math_dollars', - format='html+empty_paragraphs', - extra_args=['--no-highlight', - '--embed-resources', - '--markdown-headings=atx', - '--preserve-tabs', - '--wrap=preserve', - '--indent=false', - '--mathml', - '--ascii', - '--lua-filter=' + mdblockquotePath, - '--lua-filter=' + emptyparaPath, - '--lua-filter=' + linebreakPath, - # '--lua-filter=' + tables - ]) - pandoc_html_to_md = pandoc_html_to_md.rstrip() - return "\n" + pandoc_html_to_md + "\n" - except Exception as e: - logger.debug(e) - raise MarkDownConversionError(e) - diff --git a/restapi/tests.py b/restapi/tests.py deleted file mode 100644 index 7ce503c..0000000 --- a/restapi/tests.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.test import TestCase - -# Create your tests here. diff --git a/restapi/urls.py b/restapi/urls.py deleted file mode 100644 index cb6d5db..0000000 --- a/restapi/urls.py +++ /dev/null @@ -1,15 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. 
If a copy of the MPL was not distributed with this -# file, You can obtain one at https://mozilla.org/MPL/2.0/. - -from django.urls import include, path, re_path -from . import views -from django.conf import settings - -urlpatterns = [ - path('format', views.format), - path('sections', views.sections), - path('splitter', views.splitter), - path('parsequestion', views.parsequestion) -] - diff --git a/restapi/views.py b/restapi/views.py deleted file mode 100644 index ed0da75..0000000 --- a/restapi/views.py +++ /dev/null @@ -1,131 +0,0 @@ -from rest_framework.decorators import api_view, permission_classes, authentication_classes, parser_classes -from rest_framework.parsers import JSONParser -from rest_framework.response import Response -from rest_framework import authentication, permissions -from rest_framework.views import APIView -from rest_framework.permissions import AllowAny -from django.conf import settings - -from .serializers import FormatSerializer -from .serializers import SectionListSerializer, SectionSerializer -from .serializers import QuestionListSerializer, QuestionBaseSerializer, QuestionSerializer -from .models import Format -from .models import SectionList, QuestionList -from .models import BaseQuestion, BaseTextAnswer -from .process.common.extract_images import extract_images -from .process.common.restore_images import restore_images -# from .process.sectioner.sectioner import run_sectioner -# from .process.splitter.splitter import run_splitter -from .process.splitter.splitter import Splitter -from .process.questionparser.questionparser import run_questionparser - -import logging -logger = logging.getLogger(__name__) - -@authentication_classes([]) -@permission_classes([]) -@api_view(['POST']) -def format(request): - maincontent_title = request.data['file'].name.split(".")[0] - filename = request.data['file'].name - temp_file_path = request.data['file'].temporary_file_path() - temp_file_name = request.data['file'].name - - format = 
Format(temp_file_path, temp_file_name, filename, maincontent_title) - format.convert_pandoc().extract_images().convert_txt().fix_numbering().run_formatter().restore_images() - serializer = FormatSerializer(format) - return Response(serializer.data, status=200) - -@parser_classes([JSONParser]) -@authentication_classes([]) -@permission_classes([]) -@api_view(['POST']) -def sections(request): - serializer = FormatSerializer(data=request.data) - if serializer.is_valid(raise_exception=True): - - sectionlist = SectionList( - content = serializer.validated_data['body']) - - sectionlist.content, images_list = extract_images( - sectionlist.content) - - sectionlist.run_sectioner() - - for section in sectionlist.sections_list: - sectionheader = restore_images(section.sectionheader, - images_list) - setattr(section, 'sectionheader', sectionheader) - sectioncontent = restore_images(section.sectioncontent, - images_list) - setattr(section, 'sectioncontent', sectioncontent) - - serializer = SectionListSerializer(sectionlist) - - return Response(serializer.data, status=200) - return Response(serializer.errors, status=400) - -@parser_classes([JSONParser]) -@authentication_classes([]) -@permission_classes([]) -@api_view(['POST']) -def splitter(request): - serializer = SectionSerializer(data=request.data) - if serializer.is_valid(raise_exception=True): - questionlist = QuestionList( - content=serializer.validated_data['sectioncontent'], - ) - - questionlist.content, images_list = extract_images( - questionlist.content) - - splitter = Splitter(questionlist.content) - splitter.add_newlines_before_question().split_questions() - questionlist = super(splitter.__class__,splitter) - for question in questionlist.question_list: - questioncontent = restore_images(question.questioncontent, - images_list) - setattr(question, 'questioncontent', questioncontent) - serializer = QuestionListSerializer(questionlist) - return Response(serializer.data, status=200) - return Response(serializer.errors, 
status=400) - -@parser_classes([JSONParser]) -@authentication_classes([]) -@permission_classes([]) -@api_view(['POST']) -def parsequestion(request): - serializer = QuestionSerializer(data=request.data) - if serializer.is_valid(raise_exception=True): - basequestion = BaseQuestion( - questioncontent=serializer.validated_data['questioncontent'], - ) - basequestion.questioncontent, images_list = extract_images( - basequestion.questioncontent) - - basequestion.get_line_elements() - basequestion.extract_question_header_elements() - basequestion.get_question_body_parts_list() - basequestion.get_number_provided() - basequestion.separate_question_and_answers() - basequestion.check_questiontype() - basequestion.compare_user_type_with_processed_type() - - serializernew = QuestionSerializer(basequestion) - return Response(serializernew.data, status=200) - return Response(serializer.errors, status=400) - - -@parser_classes([JSONParser]) -@authentication_classes([]) -@permission_classes([]) -@api_view(['POST']) -def endanswer(request): - - return Response("endanswer") - -class RootPath(APIView): - permission_classes = [AllowAny] - - def get(self, request, format=None): - return Response(settings.APP_VERSION, status=200) \ No newline at end of file