From 976445e0483b2769e30028545209132eea5be4b7 Mon Sep 17 00:00:00 2001 From: Josiah Campbell <9521010+jocmp@users.noreply.github.com> Date: Sat, 4 Apr 2026 11:40:36 -0400 Subject: [PATCH] Skip duplicate items based on title + content hash --- .../feedbin/FeedbinAccountDelegate.kt | 1 + .../accounts/local/LocalAccountDelegate.kt | 95 +++++++++++-------- .../miniflux/MinifluxAccountDelegate.kt | 1 + .../accounts/reader/ReaderAccountDelegate.kt | 1 + .../main/java/com/jocmp/capy/common/MD5.kt | 12 +++ .../jocmp/capy/persistence/ArticleMapper.kt | 2 + .../capy/db/24_AddContentHashToArticles.sqm | 3 + .../sqldelight/com/jocmp/capy/db/articles.sq | 14 ++- .../com/jocmp/capy/fixtures/ArticleFixture.kt | 3 +- .../capy/persistence/ArticleMapperTest.kt | 1 + 10 files changed, 92 insertions(+), 41 deletions(-) create mode 100644 capy/src/main/java/com/jocmp/capy/common/MD5.kt create mode 100644 capy/src/main/sqldelight/com/jocmp/capy/db/24_AddContentHashToArticles.sqm diff --git a/capy/src/main/java/com/jocmp/capy/accounts/feedbin/FeedbinAccountDelegate.kt b/capy/src/main/java/com/jocmp/capy/accounts/feedbin/FeedbinAccountDelegate.kt index 949cb7f0b..8f5f1186f 100644 --- a/capy/src/main/java/com/jocmp/capy/accounts/feedbin/FeedbinAccountDelegate.kt +++ b/capy/src/main/java/com/jocmp/capy/accounts/feedbin/FeedbinAccountDelegate.kt @@ -466,6 +466,7 @@ internal class FeedbinAccountDelegate( image_url = entry.images?.size_1?.cdn_url, published_at = entry.published.toDateTime?.toEpochSecond(), enclosure_type = enclosureType, + content_hash = null, ) articleRecords.createStatus( diff --git a/capy/src/main/java/com/jocmp/capy/accounts/local/LocalAccountDelegate.kt b/capy/src/main/java/com/jocmp/capy/accounts/local/LocalAccountDelegate.kt index 3f2e692b9..32a3a4660 100644 --- a/capy/src/main/java/com/jocmp/capy/accounts/local/LocalAccountDelegate.kt +++ b/capy/src/main/java/com/jocmp/capy/accounts/local/LocalAccountDelegate.kt @@ -7,6 +7,7 @@ import com.jocmp.capy.Feed import com.jocmp.capy.accounts.AddFeedResult import com.jocmp.capy.accounts.FeedOption import com.jocmp.capy.common.ContentFormatter +import com.jocmp.capy.common.MD5 import com.jocmp.capy.common.TimeHelpers.nowUTC import com.jocmp.capy.common.TimeHelpers.published import com.jocmp.capy.common.transactionWithErrorHandling @@ -239,49 +240,62 @@ internal class LocalAccountDelegate( ) { val filters = preferences.filterKeywords.get() + val parsedItems = items.mapNotNull { item -> + val publishedAt = published(item.pubDate, fallback = updatedAt).toEpochSecond() + val parsedItem = ParsedItem(item, siteURL = feed.siteURL) + val withinCutoff = cutoffDate == null || publishedAt > cutoffDate.toEpochSecond() + val blocked = containsFilteredText(parsedItem, filters) + + if (parsedItem.id != null && withinCutoff && !blocked) { + val contentHash = MD5.from(parsedItem.title + parsedItem.contentHTML.orEmpty()) + ParsedArticle(parsedItem, item, publishedAt, contentHash) + } else { + null + } + } + + val contentHashes = parsedItems.map { it.contentHash } + val existingHashes = database.articlesQueries + .findExistingHashes(feedID = feed.id, contentHashes = contentHashes) + .executeAsList() + .mapNotNull { it.content_hash } + .toSet() + + val newItems = parsedItems.filter { it.contentHash !in existingHashes } + database.transactionWithErrorHandling { - items.forEach { item -> - val publishedAt = published(item.pubDate, fallback = updatedAt).toEpochSecond() - val parsedItem = ParsedItem( - item, - siteURL = feed.siteURL + newItems.forEach { (parsedItem, item, publishedAt, contentHash) -> + val enclosureType = parsedItem.enclosures.firstOrNull()?.type + + database.articlesQueries.create( + id = parsedItem.id!!, + feed_id = feed.id, + title = parsedItem.title, + author = item.author, + content_html = parsedItem.contentHTML, + url = parsedItem.url, + summary = item.summary, + extracted_content_url = null, + image_url = parsedItem.imageURL, + published_at = publishedAt, + enclosure_type = enclosureType, + content_hash = contentHash, ) - val withinCutoff = cutoffDate == null || publishedAt > cutoffDate.toEpochSecond() - val blocked = containsFilteredText(parsedItem, filters) - - if (parsedItem.id != null && withinCutoff && !blocked) { - val enclosureType = parsedItem.enclosures.firstOrNull()?.type - - database.articlesQueries.create( - id = parsedItem.id, - feed_id = feed.id, - title = parsedItem.title, - author = item.author, - content_html = parsedItem.contentHTML, - url = parsedItem.url, - summary = item.summary, - extracted_content_url = null, - image_url = parsedItem.imageURL, - published_at = publishedAt, - enclosure_type = enclosureType, - ) + articleRecords.createStatus( + articleID = parsedItem.id, + updatedAt = updatedAt, + read = false, + ) - articleRecords.createStatus( + parsedItem.enclosures.forEach { + enclosureRecords.create( + url = it.url.toString(), + type = it.type, articleID = parsedItem.id, - updatedAt = updatedAt, - read = false + itunesDurationSeconds = it.itunesDurationSeconds?.toString(), + itunesImage = it.itunesImage, ) - - parsedItem.enclosures.forEach { - enclosureRecords.create( - url = it.url.toString(), - type = it.type, - articleID = parsedItem.id, - itunesDurationSeconds = it.itunesDurationSeconds?.toString(), - itunesImage = it.itunesImage, - ) - } } } } @@ -340,6 +354,13 @@ internal class LocalAccountDelegate( } } +private data class ParsedArticle( + val parsedItem: ParsedItem, + val item: RssItem, + val publishedAt: Long, + val contentHash: String, +) + internal val RssItem.contentHTML: String? get() { val currentContent = content.orEmpty().ifBlank { diff --git a/capy/src/main/java/com/jocmp/capy/accounts/miniflux/MinifluxAccountDelegate.kt b/capy/src/main/java/com/jocmp/capy/accounts/miniflux/MinifluxAccountDelegate.kt index bcd70327f..333c1e8c1 100644 --- a/capy/src/main/java/com/jocmp/capy/accounts/miniflux/MinifluxAccountDelegate.kt +++ b/capy/src/main/java/com/jocmp/capy/accounts/miniflux/MinifluxAccountDelegate.kt @@ -408,6 +408,7 @@ internal class MinifluxAccountDelegate( image_url = imageURL, published_at = entry.published_at.toDateTime?.toEpochSecond(), enclosure_type = enclosures.firstOrNull()?.mime_type, + content_hash = null, ) articleRecords.createStatus( diff --git a/capy/src/main/java/com/jocmp/capy/accounts/reader/ReaderAccountDelegate.kt b/capy/src/main/java/com/jocmp/capy/accounts/reader/ReaderAccountDelegate.kt index 0ba42e04a..7456dd89c 100644 --- a/capy/src/main/java/com/jocmp/capy/accounts/reader/ReaderAccountDelegate.kt +++ b/capy/src/main/java/com/jocmp/capy/accounts/reader/ReaderAccountDelegate.kt @@ -529,6 +529,7 @@ internal class ReaderAccountDelegate( image_url = ReaderEnclosureParsing.parsedImageURL(item), published_at = item.published, enclosure_type = enclosureType, + content_hash = null, ) articleRecords.updateStatus( diff --git a/capy/src/main/java/com/jocmp/capy/common/MD5.kt b/capy/src/main/java/com/jocmp/capy/common/MD5.kt new file mode 100644 index 000000000..b3f9c4b67 --- /dev/null +++ b/capy/src/main/java/com/jocmp/capy/common/MD5.kt @@ -0,0 +1,12 @@ +package com.jocmp.capy.common + +import java.security.MessageDigest + +@OptIn(ExperimentalStdlibApi::class) +object MD5 { + fun from(value: String): String { + val md = MessageDigest.getInstance("MD5") + val digest = md.digest(value.toByteArray()) + return digest.toHexString() + } +} diff --git a/capy/src/main/java/com/jocmp/capy/persistence/ArticleMapper.kt b/capy/src/main/java/com/jocmp/capy/persistence/ArticleMapper.kt index e39721a6e..bb9dcb549 100644 --- a/capy/src/main/java/com/jocmp/capy/persistence/ArticleMapper.kt +++ b/capy/src/main/java/com/jocmp/capy/persistence/ArticleMapper.kt @@ -17,6 +17,7 @@ internal fun articleMapper( imageURL: String?, publishedAt: Long?, enclosureType: String?, + @Suppress("UNUSED_PARAMETER") contentHash: String?, feedTitle: String?, faviconURL: String?, enableStickyContent: Boolean, @@ -85,6 +86,7 @@ internal fun listMapper( imageURL = imageURL, publishedAt = publishedAt, enclosureType = enclosureType, + contentHash = null, feedTitle = feedTitle, faviconURL = faviconURL, enableStickyContent = false, diff --git a/capy/src/main/sqldelight/com/jocmp/capy/db/24_AddContentHashToArticles.sqm b/capy/src/main/sqldelight/com/jocmp/capy/db/24_AddContentHashToArticles.sqm new file mode 100644 index 000000000..0d56514e5 --- /dev/null +++ b/capy/src/main/sqldelight/com/jocmp/capy/db/24_AddContentHashToArticles.sqm @@ -0,0 +1,3 @@ +ALTER TABLE articles ADD COLUMN content_hash TEXT; + +CREATE INDEX articles_feed_content_hash ON articles(feed_id, content_hash); diff --git a/capy/src/main/sqldelight/com/jocmp/capy/db/articles.sq b/capy/src/main/sqldelight/com/jocmp/capy/db/articles.sq index 15e3fc9a1..d091358f0 100644 --- a/capy/src/main/sqldelight/com/jocmp/capy/db/articles.sq +++ b/capy/src/main/sqldelight/com/jocmp/capy/db/articles.sq @@ -65,7 +65,8 @@ INSERT INTO articles( summary, image_url, published_at, - enclosure_type + enclosure_type, + content_hash ) VALUES ( :id, @@ -78,7 +79,8 @@ VALUES ( :summary, :image_url, :published_at, -:enclosure_type +:enclosure_type, +:content_hash ) ON CONFLICT(id) DO UPDATE SET @@ -92,7 +94,8 @@ url = excluded.url, summary = excluded.summary, image_url = excluded.image_url, published_at = published_at, -enclosure_type = excluded.enclosure_type; +enclosure_type = excluded.enclosure_type, +content_hash = excluded.content_hash; createStatus: INSERT INTO article_statuses( @@ -282,6 +285,11 @@ deletePageByID { DELETE FROM articles WHERE id = :articleID; } +findExistingHashes: +SELECT content_hash FROM articles +WHERE feed_id = :feedID +AND content_hash IN :contentHashes; + findIDsByFeed: SELECT id FROM articles WHERE feed_id = :feedID; diff --git a/capy/src/test/java/com/jocmp/capy/fixtures/ArticleFixture.kt b/capy/src/test/java/com/jocmp/capy/fixtures/ArticleFixture.kt index 2138e02b1..7753e0843 100644 --- a/capy/src/test/java/com/jocmp/capy/fixtures/ArticleFixture.kt +++ b/capy/src/test/java/com/jocmp/capy/fixtures/ArticleFixture.kt @@ -33,7 +33,8 @@ class ArticleFixture(private val database: Database = InMemoryDatabaseProvider() published_at = publishedAt, summary = summary, url = url, - enclosure_type = null + enclosure_type = null, + content_hash = null, ) database.articlesQueries.createStatus( article_id = id, diff --git a/capy/src/test/java/com/jocmp/capy/persistence/ArticleMapperTest.kt b/capy/src/test/java/com/jocmp/capy/persistence/ArticleMapperTest.kt index 2110fc4a2..dee8f89d4 100644 --- a/capy/src/test/java/com/jocmp/capy/persistence/ArticleMapperTest.kt +++ b/capy/src/test/java/com/jocmp/capy/persistence/ArticleMapperTest.kt @@ -20,6 +20,7 @@ class ArticleMapperTest { imageURL = "https://cdn.vox-cdn.com/thumbor/r-eWiuX74LfGvTxwenExmwmkPlk=/0x0:1800x1200/1310x873/cdn.vox-cdn.com/uploads/chorus_image/image/73010063/Vizio_TV_D_Series_Lifestyle.0.jpg", publishedAt = 1703960809, enclosureType = null, + contentHash = null, feedTitle = "", faviconURL = null, enableStickyContent = false,