Skip to content

Commit 2ef9dbd

Browse files
committed
feat: Move examples to a package and re-write the buggy Parquet datasource
feat: Show usage guide in README
1 parent 29054df commit 2ef9dbd

8 files changed

Lines changed: 397 additions & 203 deletions

File tree

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,4 +98,8 @@ TODO
9898
### Datasets ###
9999
datasets/
100100

101+
### Python Environment ###
102+
env/
103+
.venv/
104+
101105
# End of https://www.toptal.com/developers/gitignore/api/java,maven,visualstudiocode,gradle

README.md

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,4 +133,47 @@ queries.
133133
- Uses vectorized Volcano-style iterator model
134134
- Processes data in batches for efficiency
135135
- Supports push-down optimizations
136-
- Implements memory-efficient operations
136+
- Implements memory-efficient operations
137+
138+
### Running the examples
139+
140+
You will probably want to use an IDE like IntelliJ or — my personal recommendation — VSCode with the
141+
Java pack at least for working with the codebase but you are free to use ed or nano as well.
142+
143+
Running this thing will require Maven for no other reason than trying to run it without Maven
144+
has made me realize this will be the last and only time I write Java as a hobby or professionally.
145+
146+
If you don't want Maven, you should be able to figure it out.
147+
148+
```sh
149+
150+
$ export JDK_JAVA_OPTIONS="--add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED"
151+
$ export MAVEN_OPTS="--add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED"
152+
153+
$ mvn compile exec:java --file glint/pom.xml
154+
```
155+
156+
```
157+
Schema [fields=[(name: passenger_count, type: Int(32, true)), (name: MAX, type: FloatingPoint(SINGLE))]]
158+
159+
Logical Plan: Aggregate: groupExpr=[#passenger_count], aggregateExpr=[MAX(CAST(#fare_amount AS FLOAT))]
160+
Scan: parquet_scan [projection=None]
161+
Optimized Plan: Aggregate: groupExpr=[#passenger_count], aggregateExpr=[MAX(CAST(#fare_amount AS FLOAT))]
162+
Scan: parquet_scan [projection=None]
163+
164+
Results:
165+
166+
0,36090.3
167+
1,623259.9
168+
2,492.5
169+
3,350.0
170+
4,500.0
171+
5,760.0
172+
6,262.5
173+
7,78.0
174+
8,87.0
175+
9,92.0
176+
null,103.2
177+
178+
Query took 2758 ms
179+
```

glint/pom.xml

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,12 +33,17 @@
3333
<dependency>
3434
<groupId>org.apache.arrow</groupId>
3535
<artifactId>arrow-vector</artifactId>
36-
<version>9.0.0</version>
36+
<version>18.0.0</version>
3737
</dependency>
3838
<dependency>
3939
<groupId>org.apache.arrow</groupId>
4040
<artifactId>arrow-memory-netty</artifactId>
41-
<version>9.0.0</version>
41+
<version>18.0.0</version>
42+
</dependency>
43+
<dependency>
44+
<groupId>org.apache.arrow</groupId>
45+
<artifactId>arrow-dataset</artifactId>
46+
<version>18.0.0</version>
4247
</dependency>
4348
<dependency>
4449
<groupId>com.google.protobuf</groupId>
@@ -145,6 +150,18 @@
145150
<artifactId>maven-project-info-reports-plugin</artifactId>
146151
<version>3.0.0</version>
147152
</plugin>
153+
<plugin>
154+
<groupId>org.codehaus.mojo</groupId>
155+
<artifactId>exec-maven-plugin</artifactId>
156+
<version>3.5.0</version>
157+
<configuration>
158+
<mainClass>co.clflushopt.glint.App</mainClass>
159+
<arguments>
160+
<argument>--add-opens</argument>
161+
<argument>java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED</argument>
162+
</arguments>
163+
</configuration>
164+
</plugin>
148165
</plugins>
149166
</pluginManagement>
150167
</build>
Lines changed: 2 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,8 @@
11
package co.clflushopt.glint;
22

33
import java.io.FileNotFoundException;
4-
import java.util.Arrays;
5-
import java.util.Iterator;
6-
import java.util.List;
7-
import java.util.Optional;
84

9-
import org.apache.arrow.vector.types.pojo.ArrowType;
10-
11-
import co.clflushopt.glint.core.CsvReaderOptions;
12-
import co.clflushopt.glint.core.ExecutionContext;
13-
import co.clflushopt.glint.dataframe.DataFrame;
14-
import co.clflushopt.glint.query.logical.expr.AggregateExpr;
15-
import co.clflushopt.glint.query.logical.expr.CastExpr;
16-
import co.clflushopt.glint.query.logical.expr.ColumnExpr;
17-
import co.clflushopt.glint.query.logical.expr.LogicalExpr;
18-
import co.clflushopt.glint.query.logical.plan.LogicalPlan;
19-
import co.clflushopt.glint.query.optimizer.QueryOptimizer;
20-
import co.clflushopt.glint.types.ArrowTypes;
21-
import co.clflushopt.glint.types.Field;
22-
import co.clflushopt.glint.types.RecordBatch;
23-
import co.clflushopt.glint.types.Schema;
5+
import co.clflushopt.glint.examples.NYCYellowTrips;
246

257
/**
268
* Hello world!
@@ -30,78 +12,10 @@ public class App {
3012
public static void main(String[] args) {
3113
System.out.println("Welcome to the Glint query compiler");
3214
try {
33-
nycTripsBenchmark(args);
15+
NYCYellowTrips.runParquetExample();
3416
} catch (FileNotFoundException e) {
3517
e.printStackTrace();
3618
}
3719
}
3820

39-
public static void nycTripsBenchmark(String[] args) throws FileNotFoundException {
40-
// Create execution context
41-
ExecutionContext ctx = ExecutionContext.create().build();
42-
43-
long startTime = System.currentTimeMillis();
44-
try {
45-
// Define the schema for NYC Taxi dataset
46-
Schema schema = new Schema(Arrays.asList(new Field("VendorID", ArrowTypes.Int32Type),
47-
new Field("tpep_pickup_datetime", ArrowTypes.StringType), // Could be Timestamp
48-
new Field("tpep_dropoff_datetime", ArrowTypes.StringType), // Could be Timestamp
49-
new Field("passenger_count", ArrowTypes.Int32Type),
50-
new Field("trip_distance", ArrowTypes.DoubleType),
51-
new Field("pickup_longitude", ArrowTypes.DoubleType),
52-
new Field("pickup_latitude", ArrowTypes.DoubleType),
53-
new Field("RatecodeID", ArrowTypes.Int32Type),
54-
new Field("store_and_fwd_flag", ArrowTypes.StringType),
55-
new Field("dropoff_longitude", ArrowTypes.DoubleType),
56-
new Field("dropoff_latitude", ArrowTypes.DoubleType),
57-
new Field("payment_type", ArrowTypes.Int32Type),
58-
new Field("fare_amount", ArrowTypes.DoubleType),
59-
new Field("extra", ArrowTypes.DoubleType),
60-
new Field("mta_tax", ArrowTypes.DoubleType),
61-
new Field("tip_amount", ArrowTypes.DoubleType),
62-
new Field("tolls_amount", ArrowTypes.DoubleType),
63-
new Field("improvement_surcharge", ArrowTypes.DoubleType),
64-
new Field("total_amount", ArrowTypes.DoubleType)));
65-
// Create DataFrame and apply transformations
66-
DataFrame df = ctx
67-
.readCsv("./datasets/yellow_tripdata_example.csv", Optional.of(schema),
68-
CsvReaderOptions.builder().delimiter(',').hasHeader(true).build())
69-
.aggregate(List.of(col("passenger_count")),
70-
List.of(max(cast(col("fare_amount"), ArrowTypes.FloatType))));
71-
72-
System.out.println("Logical Plan:\t" + LogicalPlan.format(df.getLogicalPlan()));
73-
System.out.println("Schema:\t" + df.getSchema());
74-
75-
// Optimize and execute the plan
76-
LogicalPlan optimizedPlan = QueryOptimizer.optimize(df.getLogicalPlan());
77-
System.out.println("Optimized Plan:\t" + LogicalPlan.format(optimizedPlan));
78-
79-
// Execute and print results
80-
Iterator<RecordBatch> results = ctx.execute(optimizedPlan);
81-
82-
while (results.hasNext()) {
83-
RecordBatch batch = results.next();
84-
System.out.println(batch.getSchema());
85-
System.out.println(batch.toCsv());
86-
87-
}
88-
89-
} finally {
90-
long endTime = System.currentTimeMillis();
91-
System.out.println("Query took " + (endTime - startTime) + " ms");
92-
}
93-
}
94-
95-
// Helper methods for creating expressions
96-
private static LogicalExpr col(String name) {
97-
return new ColumnExpr(name);
98-
}
99-
100-
private static LogicalExpr cast(LogicalExpr expr, ArrowType targetType) {
101-
return new CastExpr(expr, targetType);
102-
}
103-
104-
private static AggregateExpr max(LogicalExpr expr) {
105-
return new AggregateExpr.Max(expr);
106-
}
10721
}

glint/src/main/java/co/clflushopt/glint/core/ExecutionContext.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import co.clflushopt.glint.dataframe.DataFrame;
1010
import co.clflushopt.glint.dataframe.DataFrameImpl;
1111
import co.clflushopt.glint.datasource.CsvDataSource;
12+
import co.clflushopt.glint.datasource.DataSource;
1213
import co.clflushopt.glint.datasource.ParquetDataSource;
1314
import co.clflushopt.glint.query.logical.plan.LogicalPlan;
1415
import co.clflushopt.glint.query.logical.plan.Scan;
@@ -109,7 +110,12 @@ public DataFrame readCsv(String path, Schema schema, CsvReaderOptions options)
109110
* @param df
110111
*/
111112
public DataFrame readParquet(String path, Optional<Schema> schema) {
112-
var source = new ParquetDataSource(path);
113+
DataSource source;
114+
if (schema.isPresent()) {
115+
source = new ParquetDataSource(path, schema.get());
116+
} else {
117+
source = new ParquetDataSource(path);
118+
}
113119
return new DataFrameImpl(new Scan("parquet_scan", source, Collections.emptyList()));
114120
}
115121

0 commit comments

Comments
 (0)