diff --git a/internal/cli/common/html.go b/internal/cli/common/html.go
index d64f338..f3bf627 100644
--- a/internal/cli/common/html.go
+++ b/internal/cli/common/html.go
@@ -13,6 +13,8 @@ func StripHTML(s string) string {
s = RemoveTagWithContent(s, "head")
// Replace block-level elements with newlines before stripping tags
+ // Note: table structure elements (table, td, th, tbody, thead, tfoot) are NOT included
+ // because they're typically used for layout; tr is included so that rows are separated
blockTags := []string{"br", "p", "div", "tr", "li", "h1", "h2", "h3", "h4", "h5", "h6"}
for _, tag := range blockTags {
// Handle <br>, <br/>, <br />
@@ -52,17 +54,25 @@ func StripHTML(s string) string {
text = strings.ReplaceAll(text, " ", " ")
}
- // Collapse multiple newlines
- for strings.Contains(text, "\n\n\n") {
- text = strings.ReplaceAll(text, "\n\n\n", "\n\n")
- }
-
- // Trim spaces from each line
+ // Trim spaces from each line first
lines := strings.Split(text, "\n")
for i, line := range lines {
lines[i] = strings.TrimSpace(line)
}
- text = strings.Join(lines, "\n")
+
+ // Remove consecutive empty lines, keeping at most one blank line
+ var cleanedLines []string
+ prevEmpty := false
+ for _, line := range lines {
+ isEmpty := line == ""
+ if isEmpty && prevEmpty {
+ continue // Skip consecutive empty lines
+ }
+ cleanedLines = append(cleanedLines, line)
+ prevEmpty = isEmpty
+ }
+
+ text = strings.Join(cleanedLines, "\n")
// Remove leading/trailing empty lines
return strings.TrimSpace(text)