Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions semantic-layer-invoice-line-item-extraction/.prettierignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Ignore artifacts:
build
coverage

**/lib
package-lock.json
1 change: 1 addition & 0 deletions semantic-layer-invoice-line-item-extraction/.prettierrc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{}
61 changes: 61 additions & 0 deletions semantic-layer-invoice-line-item-extraction/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Semantic Layer Sample: Invoice Line Item Extraction

A command-line tool that uploads an invoice PDF file to Vertesia and triggers the semantic layer analysis.

## Features

- Upload a PDF and get a structured XML conversion that includes OCR of embedded images and captures the meaning of the page layouts.
- Extract line items as a table in a custom format.

## Installation

### Prerequisites

- Git
- Node.js (v18 or higher)
- npm
- A Vertesia Account and API Key
- A Vertesia semantic layer subscription

### Setup

1. Clone this repository:

```bash
git clone https://github.com/vertesia/examples.git
cd examples/semantic-layer-invoice-line-item-extraction
```

2. Install dependencies:

```bash
npm install
```

3. Create a `.env` file with your API key:

```bash
cat << EOF > .env
STUDIO_URL=https://studio-server-production.api.becomposable.com
ZENO_URL=https://zeno-server-production.api.becomposable.com
API_KEY=
EOF
```

## Usage

Basic usage with the provided sample file:

```bash
npm run start
```

Or with your own PDF file:

```bash
npx bun ./index.ts -- -i <your_file>
```

## Contributing

Contributions are welcome! Please feel free to submit a Pull Request.
29 changes: 29 additions & 0 deletions semantic-layer-invoice-line-item-extraction/eslint.config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import js from "@eslint/js";
import globals from "globals";
import tseslint from "typescript-eslint";
import { defineConfig, globalIgnores } from "eslint/config";
import markdown from "@eslint/markdown";

/**
 * Flat ESLint configuration for this sample.
 *
 * - Ignores everything under `lib/`.
 * - Applies the `js/recommended` rules to JavaScript and TypeScript sources.
 * - Lints Markdown files as GitHub-flavored Markdown and forbids inline HTML.
 * - Adds the typescript-eslint recommended configuration.
 */
export default defineConfig([
  globalIgnores(["lib/*"]),
  {
    files: ["**/*.{js,mjs,cjs,ts}"],
    plugins: { js },
    extends: ["js/recommended"],
  },
  {
    files: ["**/*.{js,mjs,cjs,ts}"],
    // The sample is a command-line tool (it uses `process`, `node:path`, `fs`),
    // so expose the Node.js globals rather than the browser ones.
    languageOptions: { globals: globals.node },
  },
  {
    files: ["**/*.md"],
    plugins: {
      markdown,
    },
    language: "markdown/gfm",
    rules: {
      "markdown/no-html": "error",
    },
  },
  tseslint.configs.recommended,
]);
Binary file not shown.
Binary file not shown.
149 changes: 149 additions & 0 deletions semantic-layer-invoice-line-item-extraction/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
import path from "node:path";
import * as dotenv from "dotenv";
import { Command } from "commander";
import { StreamSource, VertesiaClient } from "@vertesia/client";
import { createReadStream } from "fs";
import { createReadableStreamFromReadable } from "node-web-stream-adapters";
import {
DocAnalyzeRunStatusResponse,
WorkflowExecutionStatus,
} from "@vertesia/common";
import { S } from "fluent-json-schema";
import papaparse from "papaparse";

// Command-line interface (Commander): a single required option, the PDF to analyze.
const program = new Command();
program.name("semantic-layer-sample");
program.description(
  "Analyze a PDF file and generate a structured representation in XML",
);
program.version("0.0.1");
program.requiredOption(
  "-i, --input <file>",
  "The PDF file to upload and analyze",
);
program.parse(process.argv);

// Parsed option values; `options.input` is the path given with -i/--input.
const options = program.opts();

// Load environment variables from the .env file into process.env.
dotenv.config();

// Fail fast with an explicit message when a required setting is missing or
// empty, instead of handing `undefined` to the client and failing later.
const missingEnv = ["STUDIO_URL", "ZENO_URL", "API_KEY"].filter(
  (name) => !process.env[name],
);
if (missingEnv.length > 0) {
  console.error(
    `Missing required environment variable(s): ${missingEnv.join(", ")}. ` +
      "Define them in a .env file or in the environment.",
  );
  process.exit(1);
}
const apikey = process.env.API_KEY;

// Initialize client. The casts are backed by the presence check above.
const client = new VertesiaClient({
  storeUrl: process.env.ZENO_URL as string,
  serverUrl: process.env.STUDIO_URL as string,
  apikey,
});

// Upload PDF file: only inputs whose name ends in ".pdf" (case-insensitive) are accepted.
const filename = path.basename(options.input);
const isPdf = filename.toLowerCase().endsWith(".pdf");
if (!isPdf) {
  console.error("The input file must be a PDF file.");
  process.exit(1);
}

// The file is read as a Node stream, adapted with
// createReadableStreamFromReadable, and sent under its base name.
const content = new StreamSource(
  createReadableStreamFromReadable(createReadStream(options.input)),
  filename,
  "application/pdf",
);
const object = await client.objects.create({ content });

console.log("Created object:", object.id);

// Run analysis: start it on the uploaded object with an empty feature list.
const analyzer = client.objects.analyze(object.id);
const analysisRun = await analyzer.start({ features: [] });
console.log("Analysis Started", analysisRun);

// Get Status
/** Returns a promise that resolves once `ms` milliseconds have elapsed. */
const delay = (ms: number) =>
  new Promise((resolve) => {
    setTimeout(resolve, ms);
  });

/**
 * Writes a progress message to stdout.
 *
 * On an interactive terminal the current line is cleared and overwritten so
 * the progress updates in place. When stdout is not a TTY (output piped or
 * redirected), `clearLine` and `cursorTo` do not exist on the stream, so each
 * update is written on its own line instead of throwing a TypeError.
 *
 * @param progress - Text to display.
 */
const printProgress = (progress: string) => {
  if (process.stdout.isTTY) {
    process.stdout.clearLine(0);
    process.stdout.cursorTo(0);
    process.stdout.write(progress);
  } else {
    process.stdout.write(`${progress}\n`);
  }
};

// Poll the analysis status every 5 seconds until it is no longer RUNNING.
let analysisStatus: DocAnalyzeRunStatusResponse;
do {
  await delay(5000);
  analysisStatus = await client.objects.analyze(object.id).getStatus();
  // `progress` may be absent (hence the optional chaining); show 0 in that
  // case rather than printing "Progress: undefined %".
  printProgress(`Progress: ${analysisStatus.progress?.percent ?? 0} %`);
} while (analysisStatus.status === WorkflowExecutionStatus.RUNNING);

// Any final status other than COMPLETED is treated as a failure.
if (analysisStatus.status !== WorkflowExecutionStatus.COMPLETED) {
  console.error(`\nAnalysis failed with status: ${analysisStatus.status}`);
  process.exit(-1);
}

console.log(`\nAnalysis was completed successfully`);

// Get Results (disabled): uncomment to fetch the analysis results and print the document.
// const results = await client.objects.analyze(object.id).getResults();
// console.log(results.document);

console.log("Converting line item tables to csv file");

// Small builders for the two recurring property kinds in the schema below.
const textField = () => S.string();
const nonNegativeNumber = () => S.number().minimum(0);

/**
 * JSON Schema for a single invoice line item, built with fluent-json-schema
 * and materialized to a plain object with `valueOf()`.
 */
const target_schema = S.object()
  .title("Invoice line item schema")
  .description("A line item")
  .prop(
    "line_item_number",
    textField().description(
      "A simple identifier number for the line item which is unique and incremental",
    ),
  )
  .prop("hs_code", textField())
  .prop("product_code", textField())
  .prop("description", textField())
  .prop("country_of_origin", textField())
  .prop("quantity", nonNegativeNumber())
  .prop("unit_price", nonNegativeNumber())
  .prop("amount", nonNegativeNumber())
  .valueOf();

console.log("Target Schema", target_schema);

// Free-text guidance sent with the adaptTables request.
const instructions = `A valid invoice line item table features rows such as description, quantity, unit price, and amount columns.`;

// Request adaptation of the object's tables to the target schema (sent as a JSON string).
const adaptTablesRun = await client.objects.analyze(object.id).adaptTables({
  instructions,
  item_name: "invoice line item",
  target_schema: JSON.stringify(target_schema),
});
console.log(adaptTablesRun);

// The workflow run ID is required to fetch the adapted tables.
const { workflow_run_id: adaptRunId } = adaptTablesRun;
if (!adaptRunId) {
  console.error("Cannot continue, workflow run ID is missing");
  process.exit(-1);
}

// Fetch the adapted tables for that run in JSON format.
const tableAdapter = client.objects.analyze(object.id);
const status = await tableAdapter.getAdaptedTables(adaptRunId, {
  format: "json",
});

console.log(status);

// Each property of `status` is one table: `comment` is used as its title and `data` holds its rows.
const tableEntries = Object.entries(status);

// Per-table summary: id, title and number of rows.
const stats = tableEntries.map(([tableId, table]) => ({
  tableId,
  title: table.comment,
  nbItems: table.data.length,
}));

// Rows of every table, concatenated in table order.
const allLineItems: object[] = tableEntries.flatMap(([, table]) => table.data);

console.log("Processed Tables", stats);

console.log("Number of line items: ", allLineItems.length);

// Serialize all collected rows to CSV and print the result.
const csv = papaparse.unparse(allLineItems);

console.log(csv);
Loading