Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 88 additions & 2 deletions Sources/Parser/Preprocessor/GMarkPreprocessor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,25 @@ public class LaTeXPreprocessor: GMarkPreprocessorProtocol {
private func processLaTeX(_ markdown: String) -> String {
    // LaTeX is detected in two regex passes so that single-dollar matching
    // cannot interfere with `$$...$$` blocks.

    // Pass 1: explicitly delimited math -- `$$...$$`, `\[...\]` and `\(...\)`.
    // These delimiters are unambiguous, so every match is accepted as-is.
    let delimitedPattern = "\\$\\$([\\s\\S]*?)\\$\\$|\\\\\\[([\\s\\S]*?)\\\\\\]|\\\\\\(([\\s\\S]*?)\\\\\\)"
    let afterDelimitedPass = processPattern(delimitedPattern, in: markdown, alwaysAccept: true)

    // Pass 2: single-dollar inline math. The lookbehind/lookahead reject any `$`
    // that is adjacent to another `$`, and `[^$\n]+?` keeps the content on one
    // line and free of `$`. Passing `alwaysAccept: false` requests validation of
    // each match, because a lone `$` is frequently a currency sign.
    let inlinePattern = "(?<!\\$)\\$(?!\\$)([^$\\n]+?)\\$(?!\\$)"
    return processPattern(inlinePattern, in: afterDelimitedPass, alwaysAccept: false)
}

private func processPattern(_ pattern: String, in markdown: String, alwaysAccept: Bool) -> String {
var result = markdown

guard let regex = try? NSRegularExpression(pattern: pattern, options: []) else {
return result
Expand All @@ -77,13 +94,82 @@ public class LaTeXPreprocessor: GMarkPreprocessorProtocol {
// Skip if content is too large (potential security issue)
guard matchedString.count < 3000 else { continue }

// Skip if already wrapped (from previous pass)
if matchedString.contains("<LaTex>") {
continue
}

// For display math, always accept. For inline math, validate
guard alwaysAccept || isValidLaTeX(matchedString) else { continue }

let wrappedString = wrapLaTeX(matchedString)
result = (result as NSString).replacingCharacters(in: matchRange, with: wrappedString)
}

return result
}

/// Heuristically decides whether a `$...$` match is inline LaTeX rather than
/// ordinary text such as a price.
///
/// - Parameter content: The matched text, including its delimiters.
/// - Returns: `true` when `content` is not wrapped in single `$` delimiters
///   (nothing to validate), or when the text between the delimiters contains a
///   backslash, `^`, `_`, or an ASCII letter next to one of `+ - * / = < >`.
///   `false` when that text is empty or whitespace, contains `|`, looks like a
///   currency amount, or shows none of the indicators above.
private func isValidLaTeX(_ content: String) -> Bool {
    // Only `$...$` spans are validated; every other shape is accepted unchanged.
    guard content.hasPrefix("$") && content.hasSuffix("$") else {
        return true
    }

    // Materialize the inner text once so both regex checks share the same
    // string and UTF-16 range.
    let inner = String(content.dropFirst().dropLast())
    let fullRange = NSRange(location: 0, length: inner.utf16.count)

    // A pattern that fails to compile is treated as "no match".
    func innerMatches(_ pattern: String) -> Bool {
        guard let regex = try? NSRegularExpression(pattern: pattern, options: []) else {
            return false
        }
        return regex.firstMatch(in: inner, options: [], range: fullRange) != nil
    }

    // Empty or whitespace-only content is not LaTeX.
    if inner.trimmingCharacters(in: .whitespaces).isEmpty {
        return false
    }

    // A table cell delimiter means the match most likely spans table cells.
    if inner.contains("|") {
        return false
    }

    // Currency-like content: a number (optionally with one `.`/`,` group)
    // followed only by `/`, `-`, ASCII letters or CJK Unified Ideographs
    // (\u4e00-\u9fff). Examples rejected here: "10", "5.99", "20/月".
    let currencyPattern = "^\\s*\\d+([.,]\\d+)?\\s*[/\\-a-zA-Z\\u4e00-\\u9fff]*\\s*$"
    if innerMatches(currencyPattern) {
        return false
    }

    // Strong indicators: a LaTeX command (backslash) or a super/subscript marker.
    if inner.contains("\\") || inner.contains("^") || inner.contains("_") {
        return true
    }

    // Weak indicator: an ASCII letter adjacent to a math operator, e.g. "x + y"
    // or "a = b". `-` is last in the character class so it needs no escaping.
    // No `^` alternative is needed here: any `^` has already returned `true`.
    // Anything else is rejected -- the default is "not LaTeX" unless confident.
    let mathPattern = "[a-zA-Z]\\s*[+*/=<>-]|[+*/=<>-]\\s*[a-zA-Z]"
    return innerMatches(mathPattern)
}

private func wrapLaTeX(_ content: String) -> String {
let lines = content.components(separatedBy: .newlines)

Expand Down
53 changes: 53 additions & 0 deletions Tests/GMarkdownTests/GMarkdownTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,57 @@ final class GMarkdownTests: XCTestCase {
// Defining Test Cases and Test Methods
// https://developer.apple.com/documentation/xctest/defining_test_cases_and_test_methods
}

// MARK: - LaTeX Preprocessor Tests

func testLaTeXPreprocessorDoesNotMatchCurrencySymbols() throws {
    let preprocessor = LaTeXPreprocessor()

    // Each entry pairs an input that must come back unchanged with the message
    // reported if it does not: a bare price, a price embedded in text, and
    // several prices in one sentence.
    let currencyCases: [(input: String, message: String)] = [
        ("$20/月", "Currency symbols like $20/月 should not be treated as LaTeX"),
        ("最低$10/月", "Currency symbols in text like 最低$10/月 should not be treated as LaTeX"),
        ("价格从$10到$20不等", "Multiple currency symbols should not be treated as LaTeX")
    ]

    for (input, message) in currencyCases {
        XCTAssertEqual(preprocessor.process(input), input, message)
    }
}

func testLaTeXPreprocessorMatchesActualLaTeX() throws {
    let preprocessor = LaTeXPreprocessor()

    // Inline math between single dollars: wrapped, with the expression intact.
    let inlineOutput = preprocessor.process("The formula is $x^2 + y^2 = z^2$ in the text")
    XCTAssertTrue(inlineOutput.contains("<LaTex>"), "Actual LaTeX expressions should be wrapped")
    XCTAssertTrue(inlineOutput.contains("x^2 + y^2 = z^2"), "LaTeX content should be preserved")

    // Display math between double dollars.
    let displayOutput = preprocessor.process("Display formula: $$\\int_0^1 x^2 dx$$")
    XCTAssertTrue(displayOutput.contains("<LaTex>"), "Display LaTeX with $$ should be wrapped")

    // Display math in backslash-bracket notation.
    let bracketOutput = preprocessor.process("Formula: \\[x + y = z\\]")
    XCTAssertTrue(bracketOutput.contains("<LaTex>"), "LaTeX with \\[...\\] should be wrapped")
}

func testLaTeXPreprocessorMixedContent() throws {
    let preprocessor = LaTeXPreprocessor()

    // Markdown table row from the original issue: the two prices sit in
    // different cells, so the text between the two `$` signs crosses `|`.
    let input = "| **收费模式** | 免费基础版 / Plus订阅 ($20/月) | **完全免费** (Web端) / API超低价 | **完全免费** | 必须付费订阅 (最低$10/月) |"
    let result = preprocessor.process(input)

    // Substring checks alone can still pass when the span between the two `$`
    // signs is wrapped, so also assert that no LaTeX wrapper was introduced.
    XCTAssertFalse(result.contains("<LaTex>"), "Table row with currency amounts should not produce a LaTeX wrapper")

    // Currency symbols should remain unchanged
    XCTAssertTrue(result.contains("$20/月"), "Currency $20/月 should not be modified")
    XCTAssertTrue(result.contains("$10/月"), "Currency $10/月 should not be modified")
}
}