diff --git a/.gitignore b/.gitignore index 1ecaea1..e87488f 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ *.log .DS_Store .fetch-cache.json +.env # Test runner output connector-result.json diff --git a/README.md b/README.md index af0fa9f..56abe5d 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,11 @@ Playwright-based data connectors for [DataConnect](https://github.com/vana-com/d |----------|---------|---------|--------| | ChatGPT | OpenAI | playwright | chatgpt.conversations, chatgpt.memories | | GitHub | GitHub | playwright | github.profile, github.repositories, github.starred | -| Instagram | Meta | playwright | instagram.profile, instagram.posts | +| [H-E-B](heb/) | HEB | playwright | heb.profile, heb.orders, heb.nutrition | +| Instagram | Meta | playwright | instagram.profile, instagram.posts, instagram.ads | | LinkedIn | LinkedIn | playwright | linkedin.profile, .experience, .education, .skills, .languages | +| Oura Ring | Oura | playwright | oura.readiness, oura.sleep, oura.activity | +| Shop | Shopify | playwright | shop.orders | | Spotify | Spotify | playwright | spotify.profile, spotify.savedTracks, spotify.playlists | | YouTube | Google | playwright | youtube.profile, youtube.subscriptions, youtube.playlists, youtube.playlistItems, youtube.likes, youtube.watchLater, youtube.history (top 50 recent items) | @@ -24,6 +27,10 @@ connectors/ ├── schemas/ # JSON schemas for exported data │ ├── chatgpt.conversations.json │ └── ... +├── heb/ +│ ├── heb-playwright.js # Connector script +│ ├── heb-playwright.json # Metadata +│ └── README.md # Setup (USDA API key) ├── openai/ │ ├── chatgpt-playwright.js # Connector script │ └── chatgpt-playwright.json # Metadata diff --git a/heb/README.md b/heb/README.md new file mode 100644 index 0000000..e0e48b2 --- /dev/null +++ b/heb/README.md @@ -0,0 +1,24 @@ +# H-E-B Connector + +Exports your H-E-B account profile, order history, and product nutrition data. + +## Scopes + +| Scope | Description | +|-------|-------------| +| `heb.profile` | Name, email, phone, delivery addresses | +| `heb.orders` | Curbside and delivery orders with items, quantities, prices, dates | +| `heb.nutrition` | Nutrition facts per product (calories, macros, sodium, fiber, vitamins) | + +## Environment variables + +| Variable | Required | Default | Description | +|----------|----------|---------|-------------| +| `USDA_API_KEY` | No | `DEMO_KEY` | [USDA FoodData Central](https://fdc.nal.usda.gov/api-guide) API key for nutrition lookup. The demo key works but is rate-limited. Get a free key at https://fdc.nal.usda.gov/api-key-signup | + +## How nutrition lookup works + +1. Try the product's UPC barcode against USDA FDC (exact match) +2. If no UPC match, search by product name against Branded foods (top 5 results scored with `scoreMatch()`) +3. If branded results are poor, fall back to Foundation data type (for produce, staples, etc.) +4. HEB's own nutrition data is used when available; incomplete HEB data (null calories/macros) falls through to USDA while preserving HEB ingredients, allergens, and category diff --git a/heb/heb-playwright.js b/heb/heb-playwright.js new file mode 100644 index 0000000..9aa3425 --- /dev/null +++ b/heb/heb-playwright.js @@ -0,0 +1,857 @@ +/** + * H-E-B Connector (Playwright) + * + * Exports: + * - heb.profile — account info (name, email, phone, delivery addresses) + * - heb.orders — curbside/delivery order history (item names, quantities, prices, dates) + * - heb.nutrition — nutrition facts per unique product (Schema.org NutritionInformation aligned) + * + * Extraction strategy: + * Order history: DOM scrape of /my-account/your-orders?page=N + * Order detail: DOM scrape of /my-account/order-history/{orderId} + * → a[href*="/product-detail"] links contain productId + name + * → li.innerText contains "Quantity: X. Price: $Y" + * Nutrition: DOM scrape of /product-detail/{slug}/{id} + * → h3 "Nutrition Facts" → ul > li innerText + * + * Known constraints: + * - Only curbside/delivery orders appear in account history (not in-store) + * - In-store gap: user can upload receipt photos in the app (not handled here) + */ + +// ─── Login Detection ────────────────────────────────────── + +const checkLoginStatus = async () => { + try { + return await page.evaluate(` + (() => { + const hasLoginForm = !!document.querySelector('input[type="password"]') || + !!document.querySelector('form[action*="sign-in"], form[action*="login"]'); + if (hasLoginForm) return false; + + const url = window.location.href; + if (url.includes('/challenge') || url.includes('/checkpoint') || + url.includes('/sign-in') || url.includes('/login')) return false; + + return !!( + document.querySelector('button[aria-label*="account" i]') || + document.querySelector('a[href*="/my-account"]') || + document.querySelector('button[aria-label="My account"]') + ); + })() + `); + } catch (e) { + return false; + } +}; + +// ─── Profile ───────────────────────────────────────────── + +const scrapeProfile = async () => { + await page.goto('https://www.heb.com/my-account/profile'); + await page.sleep(1500); + + return await page.evaluate(` + (() => { + // Extract label→value pairs from the personal info section + const getField = (label) => { + const p = Array.from(document.querySelectorAll('p')) + .find(el => el.textContent.trim() === label); + return p?.nextElementSibling?.textContent?.trim() || null; + }; + + // Delivery addresses + const addresses = []; + document.querySelectorAll('main > div > div').forEach(card => { + const addrEl = card.querySelector('p'); + if (!addrEl) return; + const text = card.innerText.trim(); + if (!text.match(/[A-Z]{2}\\s+\\d{5}/)) return; // must contain state + zip + const labelEl = card.querySelector('div'); + const isPrimary = !!card.querySelector('[aria-label*="Primary"], span:not([class])'); + addresses.push({ + label: labelEl?.firstChild?.textContent?.trim() || null, + address: addrEl.innerText.replace(/\\s+/g, ' ').trim(), + isPrimary: card.innerText.includes('Primary'), + }); + }); + + return { + name: getField('Name'), + email: getField('Email'), + phone: getField('Mobile number'), + deliveryAddresses: addresses, + }; + })() + `); +}; + +// ─── Order History ──────────────────────────────────────── + +const scrapeOrderListPage = async (pageNum) => { + await page.goto(`https://www.heb.com/my-account/your-orders?page=${pageNum}`); + await page.sleep(2000); + + return await page.evaluate(` + (() => { + const orders = []; + document.querySelectorAll('a[href*="/my-account/order-history/HEB"]').forEach(a => { + const href = a.href; + const orderId = href.split('/').pop(); + if (!orderId || orderId.startsWith('HEB') === false) return; + const card = a.closest('li') || a.parentElement; + const text = card ? card.innerText : ''; + const dateMatch = text.match(/([A-Z][a-z]+ \\d+, \\d{4})/); + const totalMatch = text.match(/\\$(\\d+\\.\\d+),?\\s*(\\d+)\\s*items?/i); + const statusMatch = text.match(/Status:\\s*([^\\n]+)/i); + const addressMatch = text.match(/(?:Delivery to|Curbside at)\\s+([^\\n]+)/i); + orders.push({ + orderId, + orderUrl: href, + orderDate: dateMatch ? dateMatch[1] : null, + total: totalMatch ? parseFloat(totalMatch[1]) : null, + itemCount: totalMatch ? parseInt(totalMatch[2]) : null, + status: statusMatch ? statusMatch[1].trim() : null, + address: addressMatch ? addressMatch[1].trim() : null, + }); + }); + // Check for next page + const hasNext = !!document.querySelector('a[aria-label*="next page" i], a[href*="?page="]'); + const paginationLinks = Array.from(document.querySelectorAll('nav[aria-label*="Pagination"] a[href*="page="]')); + const maxPage = paginationLinks.reduce((max, a) => { + const m = a.href.match(/page=(\\d+)/); + return m ? Math.max(max, parseInt(m[1])) : max; + }, 1); + return { orders, maxPage }; + })() + `); +}; + +const scrapeOrderDetail = async (orderId, orderUrl) => { + await page.goto(orderUrl); + await page.sleep(1500); + + return await page.evaluate(` + (() => { + const seen = new Set(); + const items = []; + document.querySelectorAll('a[href*="/product-detail"]').forEach(a => { + const name = a.textContent.trim(); + if (!name) return; + const href = a.href; + if (seen.has(href)) return; + seen.add(href); + const productId = href.split('/').pop(); + const li = a.closest('li'); + const text = li ? li.innerText : ''; + const qtyMatch = text.match(/Quantity:\\s*([^\\n.]+)/); + const priceMatch = text.match(/Price:\\s*\\$?([\\d.]+)/); + const padded = productId.padStart(9, '0'); + items.push({ + name, + productId, + productUrl: href, + imageUrl: 'https://images.heb.com/is/image/HEBGrocery/prd-small/' + padded + '.jpg', + quantity: qtyMatch ? qtyMatch[1].trim() : null, + price: priceMatch ? parseFloat(priceMatch[1]) : null, + }); + }); + + // Order metadata from page + const heading = document.querySelector('main h1, main h2'); + const dateEl = Array.from(document.querySelectorAll('main p, main div')) + .find(el => el.textContent.match(/[A-Z][a-z]+ \\d+, \\d{4}/)); + const orderDate = dateEl ? (dateEl.textContent.match(/([A-Z][a-z]+ \\d+, \\d{4})/) || [])[1] : null; + + return { items, orderDate }; + })() + `); +}; + +// ─── Bot Detection ─────────────────────────────────────── + +const detectBlock = async () => { + return await page.evaluate(` + (() => { + const url = window.location.href; + const title = document.title; + const html = document.documentElement.outerHTML; + + // Incapsula / Imperva (HEB's actual bot protection) + // Block page replaces entire document with a single iframe; all content is inside it. + // NOTE: _Incapsula_Resource is present on ALL HEB pages (it's their WAF). + // Only count as blocked when Incapsula is present AND the page is an empty shell. + const hasIncapsula = html.includes('_Incapsula_Resource') || + !!document.querySelector('iframe[src*="Incapsula"], script[src*="Incapsula"]'); + + // No real product content on the page (Incapsula serves an empty shell) + const hasProductContent = !!( + document.querySelector('h3') || + document.querySelector('nav[aria-label*="Breadcrumb"]') || + document.querySelector('[data-testid]') + ); + const isEmptyShell = !hasProductContent && + document.querySelectorAll('iframe').length > 0 && + (document.body?.children?.length || 0) <= 2; + + // Incapsula block = WAF script present AND page has no real content + const isIncapsulaBlocked = hasIncapsula && isEmptyShell; + + // DataDome (kept for robustness) + const hasDataDome = !!document.getElementById('datadome'); + + // Generic captcha signals + const hasCaptcha = !!( + document.querySelector('iframe[src*="captcha"]') || + document.querySelector('[id*="captcha"]') || + document.querySelector('[class*="captcha"]') + ); + + const isBlockedUrl = ( + url.includes('geo.captcha-delivery.com') || + url.includes('/challenge') || + url.includes('/blocked') + ); + const isBlockedTitle = /captcha|verify|access.denied|are.you.human|security.check/i.test(title); + + const blocked = isIncapsulaBlocked || hasDataDome || hasCaptcha || isBlockedUrl || isBlockedTitle; + return blocked ? { blocked: true, url, title } : { blocked: false }; + })() + `); +}; + +// ─── USDA FoodData Central Fallback ───────────────────── + +const USDA_API_KEY = process.env.USDA_API_KEY || 'DEMO_KEY'; + +const cleanProductName = (name) => { + return name + .replace(/^(H-E-B|Hill Country Fare|Central Market|Meal Simple by H-E-B)\s+/i, '') + // Fractional sizes first (e.g. ", 1/2 gal") — must run before simple sizes + .replace(/,?\s*\d+\s*\/\s*\d+\s*(gal|oz|lb)$/i, '') + .replace(/,?\s*Avg\.\s*[\d.]+\s*(lb|lbs|oz)$/i, '') + // Trailing "12 oz Cans, 12 pk" or "12 Mega XL Rolls" + .replace(/,?\s*\d+\s*(oz|fl oz)\s*(Cans|Bottles|Boxes),?\s*\d+\s*(pk|ct)$/i, '') + .replace(/,?\s*\d+\s+(Mega\s+)?(XL\s+)?(Super\s+)?(Rolls|Bags|Pacs|Cans)$/i, '') + // Simple trailing size (e.g. ", 10 oz", ", 3 ct bag") or bare unit (", Each") + .replace(/,?\s*(\d+(\.\d+)?\s*)?(oz|lb|lbs|fl oz|gal|ct|pk|count|each|bundle|bag)(\s+bag)?\.?$/i, '') + .trim(); +}; + +// Score how well a USDA result matches our query (0–1) +const scoreMatch = (query, food) => { + const qWords = query.toLowerCase().split(/\s+/).filter(w => w.length > 2); + if (qWords.length === 0) return 0; + const desc = (food.description || '').toLowerCase(); + const brand = (food.brandName || food.brandOwner || '').toLowerCase(); + const haystack = `${desc} ${brand}`; + let hits = 0; + for (const w of qWords) { + if (haystack.includes(w)) hits++; + } + return hits / qWords.length; +}; + +// Pick the best-scoring food from a list, above a minimum threshold +const bestMatch = (query, foods, minScore = 0.4) => { + let best = null, bestScore = 0; + for (const food of foods) { + const s = scoreMatch(query, food); + if (s > bestScore) { best = food; bestScore = s; } + } + return bestScore >= minScore ? best : null; +}; + +const lookupUSDA = async (name, upc) => { + // Try UPC first (deterministic match) + if (upc) { + // Try as-is first, then zero-padded to GTIN-14 format + const formats = [upc]; + if (upc.length < 14) formats.push(upc.padStart(14, '0')); + if (upc.length < 13) formats.push(upc.padStart(13, '0')); + + for (const fmt of formats) { + const upcRes = await page.httpFetch( + `https://api.nal.usda.gov/fdc/v1/foods/search?query=gtinUpc:${fmt}&pageSize=1&api_key=${USDA_API_KEY}`, + { timeout: 10000 } + ); + if (upcRes.ok && upcRes.json?.foods?.[0]) { + return { food: upcRes.json.foods[0], matchMethod: 'upc' }; + } + } + } + + // Fall back to text search with match validation + const cleaned = cleanProductName(name); + if (!cleaned || cleaned.length < 3) return null; + + // Try Branded first (most grocery products) + const brandedRes = await page.httpFetch( + `https://api.nal.usda.gov/fdc/v1/foods/search?query=${encodeURIComponent(cleaned)}&pageSize=5&dataType=Branded&api_key=${USDA_API_KEY}`, + { timeout: 10000 } + ); + if (brandedRes.ok && brandedRes.json?.foods?.length) { + const match = bestMatch(cleaned, brandedRes.json.foods); + if (match) return { food: match, matchMethod: 'text' }; + } + + // Fall back to Foundation (better for produce, staples, generic items) + const foundationRes = await page.httpFetch( + `https://api.nal.usda.gov/fdc/v1/foods/search?query=${encodeURIComponent(cleaned)}&pageSize=5&dataType=Foundation&api_key=${USDA_API_KEY}`, + { timeout: 10000 } + ); + if (foundationRes.ok && foundationRes.json?.foods?.length) { + const match = bestMatch(cleaned, foundationRes.json.foods, 0.3); + if (match) return { food: match, matchMethod: 'text_foundation' }; + } + + return null; +}; + +const mapUSDANutrients = (food, matchMethod) => { + const getRaw = (id) => { + const n = food.foodNutrients?.find(fn => fn.nutrientId === id); + return n ? n.value : null; + }; + + // USDA FDC branded food nutrients are per 100g. Scale to per-serving. + const srvG = food.servingSize || 100; + const scale = srvG / 100; + const get = (id) => { + const raw = getRaw(id); + return raw != null ? Math.round(raw * scale * 100) / 100 : null; + }; + + return { + '@type': 'https://schema.org/NutritionInformation', + source: 'usda_fdc', + confidence: matchMethod === 'upc' ? 'high' : matchMethod === 'text' ? 'medium' : 'low', + usdaFdcId: food.fdcId, + usdaMatchMethod: matchMethod, + usdaDescription: food.description, + usdaBrand: food.brandOwner || null, + servingSize: food.servingSize ? `${food.servingSize}${food.servingSizeUnit || 'g'}` : null, + servingsPerContainer: null, + calories: get(1008), + protein_g: get(1003), + carbs_g: get(1005), + fat_g: get(1004), + saturated_fat_g: get(1258), + trans_fat_g: get(1257), + cholesterol_mg: get(1253), + fiber_g: get(1079), + sugar_g: get(2000), + added_sugar_g: get(1235), + sodium_mg: get(1093), + potassium_mg: get(1092), + calcium_mg: get(1087), + iron_mg: get(1089), + vitamin_d_mcg: get(1114), + ingredients: food.ingredients || null, + allergens: null, + highlights: [], + category: [], + }; +}; + +// ─── Nutrition ──────────────────────────────────────────── + +const productImageUrl = (productId) => { + // HEB image CDN: productId zero-padded to 9 digits for thumbnails, + // or plain productId for the full gallery format + const padded = String(productId).padStart(9, '0'); + return { + thumbnail: `https://images.heb.com/is/image/HEBGrocery/prd-small/${padded}.jpg`, + full: `https://images.heb.com/is/image/HEBGrocery/${productId}-1`, + }; +}; + +const scrapeNutrition = async (productUrl, productId) => { + try { + await page.goto(productUrl); + } catch (e) { + return { source: 'error', confidence: 'low', error: String(e).split('\n')[0] }; + } + await page.sleep(2500); + + // Check for bot detection before scraping + const blockStatus = await detectBlock(); + if (blockStatus.blocked) { + return { source: 'blocked', confidence: 'low', pageTitle: blockStatus.title }; + } + + return await page.evaluate(` + (() => { + // Collect gallery images (needed as fallback when no text nutrition panel exists) + const pid = ${JSON.stringify(String(productId))}; + const galleryImgs = Array.from(document.querySelectorAll('img')) + .filter(img => img.src?.includes(pid) && img.alt && !img.alt.toLowerCase().includes('advertisement')) + .map(img => ({ src: img.src, alt: img.alt })) + .slice(0, 5); + + // Extract UPC/GTIN if available (structured data, meta tags, or product details) + let upc = null; + // Check JSON-LD structured data — walk all nodes recursively + const findGtin = (obj) => { + if (!obj || typeof obj !== 'object') return null; + // Check this node directly + if (obj.gtin12 || obj.gtin13 || obj.gtin || obj.gtin8) { + return obj.gtin12 || obj.gtin13 || obj.gtin || obj.gtin8; + } + // Recurse into arrays and object values + if (Array.isArray(obj)) { + for (const item of obj) { + const found = findGtin(item); + if (found) return found; + } + } else { + for (const val of Object.values(obj)) { + if (typeof val === 'object') { + const found = findGtin(val); + if (found) return found; + } + } + } + return null; + }; + document.querySelectorAll('script[type="application/ld+json"]').forEach(script => { + if (upc) return; + try { + const data = JSON.parse(script.textContent); + upc = findGtin(data); + } catch {} + }); + // Check meta tags + if (!upc) { + const metaUpc = document.querySelector('meta[property="product:upc"], meta[name="upc"], meta[itemprop="gtin13"], meta[itemprop="gtin12"], meta[itemprop="gtin"]'); + if (metaUpc) upc = metaUpc.content; + } + // Check visible product details (common pattern: "UPC: 012345678901") + if (!upc) { + const allText = document.body.innerText; + const upcMatch = allText.match(/(?:UPC|GTIN|Barcode|Item #)[:\\s]*([\\d-]{8,14})/i); + if (upcMatch) upc = upcMatch[1].replace(/-/g, ''); + } + + // Try to parse text nutrition panel + const h3 = Array.from(document.querySelectorAll('h3')) + .find(el => el.textContent.includes('Nutrition Facts')); + + if (!h3) { + return { source: 'not_found', confidence: 'low', upc }; + } + + const list = h3.closest('div')?.querySelector('ul'); + if (!list) return { source: 'not_found', confidence: 'low', upc }; + + let calories = null; + const nutrients = {}; + + // HEB nutrition panel DOM structure (as of 2026): + //
  • Amount Per Serving
  • + //
  • Calories70
    Calories from Fat45
  • + //
  • Total Fat
  • + //
  • Protein
  • + // + // Strategy: for each
  • , check for the calorie pattern first (span + adjacent text), + // then extract nutrient name from and value from nested