Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,21 @@ gems list
gems imagen -t "Hi, can you create a 3d rendered image of a pig with wings and a top hat flying over a happy futuristic scifi city with lots of greenery?"
```

### Generate a Video

```sh
gems vidgen -t "Yo, generate a humble bumble golden retriever puppy running through a flower field."
```

### Text to speech

```sh
gems tts -t "Yo, Say 'Hello' with a humble bumble voice!"

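# The command writes raw PCM audio to `output.pcm`. To convert it to WAV
# (16-bit little-endian, 24 kHz, mono), install `ffmpeg` and run: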
ffmpeg -f s16le -ar 24000 -ac 1 -i output.pcm out.wav
```

### TUI mode

```sh
Expand All @@ -143,6 +158,8 @@ gems
| `vision` | Analyze an image and generate content from text. |
| `stream` | Stream the generation of content. |
| `imagen` | Generate an image. |
| `vidgen` | Generate a video. |
| `tts` | Convert text to speech. |
| `count` | Count the number of tokens in a text. |
| `embed` | Embed content into a specified model. |
| `batch` | Batch embed multiple contents. |
Expand Down
33 changes: 33 additions & 0 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,24 @@ EXAMPLES:
Batch embed contents:
gems -m 'embedding-001' batch -t "Write a story about a magic backpack.","Generate a poem about nature."

Generate an Image:
gems imagen -t "Yo, create a 3d rendered image of a cat with wings."

Generate a Video:
gems vidgen -t "Yo, generate a humble bumble golden retriever puppy running through a flower field."

Text to speech:
gems tts -t "Yo, Say 'Hello' with a humble bumble voice!."

Get model info:
gems info

List models:
gems list

TUI mode:
gems

For more information, visit: github.com/kevin-rs/gems
"#
)]
Expand All @@ -104,6 +116,8 @@ pub enum Command {
Info(Info),
List(List),
Imagen(Imagen),
Vidgen(Vidgen),
Tts(Tts),
}

#[cfg(feature = "cli")]
Expand Down Expand Up @@ -172,3 +186,22 @@ pub struct Imagen {
#[arg(short, long, default_value_t = String::from("Hi, step bro... I need help generating a happy, humble, bumble Rustacean. he's stuck in the shower and won't compile."))]
pub text: String,
}

// Command-line arguments for the `vidgen` subcommand; only compiled when the
// `cli` feature is enabled.
// NOTE(review): plain `//` comments are used on purpose here, because `///`
// text on clap-derived items can surface in `--help` output; confirm before
// converting them to doc comments.
#[cfg(feature = "cli")]
#[derive(Args, Debug, Clone)]
pub struct Vidgen {
/// The text to generate video from.
// Exposed as `-t` / `--text`; the sample prompt below is used when the flag is omitted.
#[arg(short, long, default_value_t = String::from("A humble bumble golden retriever puppy running through a flower field"))]
pub text: String,
}

// Command-line arguments for the `tts` subcommand; only compiled when the
// `cli` feature is enabled.
// NOTE(review): plain `//` comments are used on purpose here, because `///`
// text on clap-derived items can surface in `--help` output; confirm before
// converting them to doc comments.
#[cfg(feature = "cli")]
#[derive(Args, Debug, Clone)]
pub struct Tts {
/// The prompt.
// Exposed as `-t` / `--text`; the sample prompt below is used when the flag is omitted.
#[arg(short, long, default_value_t = String::from("Say cheerfully: Have a wonderful day!"))]
pub text: String,
/// The voice.
// Exposed as `-v` / `--voice`; defaults to the voice name "Kore".
#[arg(short, long, default_value_t = String::from("Kore"))]
pub voice: String,
}
15 changes: 14 additions & 1 deletion src/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ use crate::models::Models;
use crate::stream::Streaming;
use crate::tokens::Tokens;
use crate::traits::CTrait;
use crate::tts::Tts;
use crate::vidgen::Videos;
use crate::vision::Visions;
use anyhow::anyhow;
use anyhow::Result;
Expand Down Expand Up @@ -66,7 +68,6 @@ impl CTrait for Client {
endpoint
)
};

let parsed_url = Url::parse_with_params(&full_url, &[("key", api_key)]).unwrap();

Ok(self
Expand Down Expand Up @@ -115,6 +116,18 @@ impl CTrait for Client {
client: self.clone(),
}
}

/// Builds a [`Videos`] handle that owns its own clone of this client.
fn videos(&self) -> Videos {
    let client = self.clone();
    Videos { client }
}

/// Builds a [`Tts`] handle that owns its own clone of this client.
fn tts(&self) -> Tts {
    let client = self.clone();
    Tts { client }
}
}

#[derive(Default)]
Expand Down
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ pub mod responses;
pub mod stream;
pub mod tokens;
pub mod traits;
pub mod tts;
pub mod utils;
pub mod vidgen;
pub mod vision;

#[cfg(feature = "cli")]
Expand Down
38 changes: 37 additions & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ async fn main() -> Result<()> {
use gems::stream::StreamBuilder;
use gems::tokens::TokenBuilder;
use gems::traits::CTrait;
use gems::tts::TtsGenBuilder;
use gems::vidgen::VideoGenBuilder;

use gems::tui::run_tui;
use gems::utils::{
extract_text_from_partial_json, load_and_encode_image, type_with_cursor_effect,
Expand Down Expand Up @@ -183,7 +186,40 @@ async fn main() -> Result<()> {

let image_data = gemini_client.images().generate(params).await?;

std::fs::write("output.png", &image_data)?;
tokio::fs::write("output.png", &image_data).await?;
}
Some(Command::Vidgen(cmd)) => {
gemini_client.set_model(Model::Veo2);

let params = VideoGenBuilder::default()
.model(Model::Veo2)
.input(Message::User {
content: Content::Text(cmd.text),
name: None,
})
.build()
.unwrap();

let bytes = gemini_client.videos().generate(params).await?;

tokio::fs::write("output.mp4", &bytes).await?;
}
Some(Command::Tts(cmd)) => {
gemini_client.set_model(Model::Tts);

let params = TtsGenBuilder::default()
.model(Model::Tts)
.input(Message::User {
content: Content::Text(cmd.text),
name: None,
})
.voice(cmd.voice)
.build()
.unwrap();

let bytes = gemini_client.tts().generate(params).await?;

tokio::fs::write("output.pcm", &bytes).await?;
}
None => {
let _ = run_tui().await;
Expand Down
3 changes: 3 additions & 0 deletions src/models.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ pub enum Model {
Embedding,
Imagen3,
Veo2,
Tts,
Flash20Live,
FlashExpImage,
}
Expand All @@ -36,6 +37,7 @@ impl ToString for Model {
Model::Embedding => "text-embedding-004",
Model::Imagen3 => "imagen-3.0-generate-002",
Model::Veo2 => "veo-2.0-generate-001",
Model::Tts => "gemini-2.5-flash-preview-tts",
Model::Flash20Live => "gemini-2.0-flash-live-001",
Model::FlashExpImage => "gemini-2.0-flash-exp-image-generation",
}
Expand All @@ -57,6 +59,7 @@ impl FromStr for Model {
"text-embedding-004" => Ok(Model::Embedding),
"imagen-3.0-generate-002" => Ok(Model::Imagen3),
"veo-2.0-generate-001" => Ok(Model::Veo2),
"gemini-2.5-flash-preview-tts" => Ok(Model::Tts),
"gemini-2.0-flash-live-001" => Ok(Model::Flash20Live),
// Keep this arm in sync with the string form of `Model::FlashExpImage` so that
// parsing the model's own string yields the same variant (not `Flash20Live`).
"gemini-2.0-flash-exp-image-generation" => Ok(Model::FlashExpImage),
_ => Err(anyhow!("Unknown model: {}", s)),
Expand Down
59 changes: 59 additions & 0 deletions src/requests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,3 +85,62 @@ pub struct GenerationConfig {
#[serde(rename = "responseModalities")]
pub response_modalities: Vec<String>,
}

/// Request payload for video generation using Veo.
///
/// Serializes to a JSON object with two keys, `instances` and `parameters`.
#[derive(Debug, Serialize)]
pub struct VideoGenRequest {
/// Prompts to submit; each entry serializes as a [`VideoPrompt`] object.
pub instances: Vec<VideoPrompt>,
/// Generation settings sent alongside the prompts.
pub parameters: VideoParameters,
}

/// The prompt inside the request.
///
/// One element of [`VideoGenRequest::instances`]; serializes as `{"prompt": "..."}`.
#[derive(Debug, Serialize)]
pub struct VideoPrompt {
/// The prompt text.
pub prompt: String,
}

/// Generation settings that accompany a video request.
///
/// Field names are emitted in camelCase (`aspectRatio`, `personGeneration`).
#[derive(Debug, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct VideoParameters {
    /// Serialized as `aspectRatio`.
    pub aspect_ratio: String,

    /// Serialized as `personGeneration`.
    pub person_generation: String,
}

/// Request body for a text-to-speech generation call.
///
/// Serialized keys: `model`, `contents`, `generationConfig` and, only when it is
/// `Some`, `system_instruction`.
#[derive(Debug, Serialize)]
pub struct TtsRequest {
/// Model identifier string included in the body.
pub model: String,
/// Content entries that make up the prompt.
pub contents: Vec<Content>,
/// Response-modality and speech settings; emitted under the camelCase key.
#[serde(rename = "generationConfig")]
pub generation_config: TtsGenerationConfig,
/// Omitted from the JSON entirely when `None`.
// NOTE(review): unlike `generationConfig`, this key is emitted in snake_case
// because it has no rename — confirm the API accepts that spelling.
#[serde(skip_serializing_if = "Option::is_none")]
pub system_instruction: Option<Content>,
}

/// Generation settings for a text-to-speech request.
///
/// Field names are emitted in camelCase (`responseModalities`, `speechConfig`).
#[derive(Debug, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct TtsGenerationConfig {
    /// Serialized as `responseModalities`.
    pub response_modalities: Vec<String>,
    /// Serialized as `speechConfig`.
    pub speech_config: SpeechConfig,
}

/// Speech settings nested inside [`TtsGenerationConfig`].
#[derive(Debug, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct SpeechConfig {
    /// Serialized as `voiceConfig`.
    pub voice_config: VoiceConfig,
}

/// Voice selection wrapper nested inside [`SpeechConfig`].
#[derive(Debug, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct VoiceConfig {
    /// Serialized as `prebuiltVoiceConfig`.
    pub prebuilt_voice_config: PrebuiltVoiceConfig,
}

/// Names the voice to use; nested inside [`VoiceConfig`].
#[derive(Debug, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct PrebuiltVoiceConfig {
    /// Serialized as `voiceName`.
    pub voice_name: String,
}
77 changes: 75 additions & 2 deletions src/responses.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,13 +133,13 @@ pub struct ImagenResponse {

#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct Candidate {
pub struct TtsCandidate {
pub content: Content,
pub finish_reason: Option<String>,
pub index: Option<i32>,
}

#[derive(Debug, Serialize, Deserialize)]
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "camelCase")]
pub struct Content {
pub parts: Vec<Part>,
Expand All @@ -156,6 +156,10 @@ pub enum Part {
#[serde(rename = "inlineData")]
inline_data: ImageContent,
},
Media {
#[serde(rename = "inlineData")]
inline_data: InlineData,
},
}

#[derive(Debug, Serialize, Deserialize, Clone)]
Expand All @@ -179,3 +183,72 @@ pub struct PromptTokenDetail {
pub modality: Option<String>,
pub token_count: Option<i32>,
}

/// Response returned immediately after submitting the generation request.
#[derive(Debug, Deserialize)]
pub struct VideoGenResponse {
/// The `name` field of the response, if present.
// NOTE(review): presumably the operation identifier used for polling — confirm in the vidgen module.
pub name: Option<String>,
}

/// Polling response to check operation status.
///
/// Every field is optional, so a body missing any of these keys still deserializes.
#[derive(Debug, Deserialize)]
pub struct OperationStatus {
/// Completion flag, when the server includes one.
pub done: Option<bool>,
/// Error details, when the server reports a failure.
pub error: Option<OperationError>,
/// Result payload, when the server includes one.
pub response: Option<OperationResponse>,
}

/// Error details if the operation fails.
#[derive(Debug, Deserialize)]
pub struct OperationError {
/// Error message text; required when this object is present.
pub message: String,
}

/// Successful operation result.
#[derive(Debug, Deserialize)]
pub struct OperationResponse {
/// The `output` object wrapping the generated video.
pub output: VideoOutput,
}

/// Output payload containing the video.
#[derive(Debug, Deserialize)]
pub struct VideoOutput {
/// The `video` object holding the encoded data and its MIME type.
pub video: EncodedVideo,
}

/// Video payload: a MIME type plus the base64-encoded content.
///
/// Field names are read from their camelCase JSON keys (`mimeType`, `base64Data`).
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct EncodedVideo {
    /// Read from `mimeType`.
    pub mime_type: String,

    /// Read from `base64Data`.
    pub base64_data: String,
}

/// Envelope for an error body of the form `{"error": {...}}`.
#[derive(Debug, Deserialize)]
pub struct ErrorWrapper {
/// The nested error object.
pub error: ErrorMessage,
}

/// Error object carried inside [`ErrorWrapper`].
#[derive(Debug, Deserialize)]
pub struct ErrorMessage {
/// Error message text.
pub message: String,
}

/// Response body of a text-to-speech generation call.
#[derive(Debug, Deserialize)]
pub struct TtsResponse {
/// Returned candidates; `None` when the `candidates` key is absent.
pub candidates: Option<Vec<Candidate>>,
}

/// A single candidate inside a [`TtsResponse`].
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct Candidate {
/// The candidate's content.
pub content: Content,
}

/// Inline media payload: a MIME type plus the data string.
// NOTE(review): `data` is presumably base64-encoded — confirm against the caller.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct InlineData {
    /// Mapped to the `mimeType` JSON key.
    #[serde(rename = "mimeType")]
    pub mime_type: String,
    /// Mapped to the `data` JSON key.
    pub data: String,
}
4 changes: 4 additions & 0 deletions src/traits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ use crate::models::Model;
use crate::models::Models;
use crate::stream::Streaming;
use crate::tokens::Tokens;
use crate::tts::Tts;
use crate::vidgen::Videos;
use crate::vision::Visions;
use anyhow::Result;
use reqwest::{Method, RequestBuilder};
Expand All @@ -23,4 +25,6 @@ pub trait CTrait {
fn stream(&self) -> Streaming;
fn models(&self) -> Models;
fn images(&self) -> Images;
fn videos(&self) -> Videos;
fn tts(&self) -> Tts;
}
Loading