1
0
mirror of https://github.com/jdiaz97/pdfx.git synced 2025-11-23 21:44:56 +02:00
Files
pdfx/src/extractor.rs
2025-06-27 06:10:50 +02:00

133 lines
4.4 KiB
Rust

use anyhow::{Result, anyhow};
use serde_json::{json, Value};
use reqwest;
use pdf_extract;
#[derive(Debug)]
pub struct PdfExtractor {
groq_api_key: String,
}
impl PdfExtractor {
pub fn new(groq_api_key: String) -> Self {
Self { groq_api_key }
}
/// Extract information from a PDF using GROQ API based on a prompt
pub async fn extract_pdf_to_json(
&self,
pdf_path: &str,
prompt: &str,
) -> Result<Value> {
// Extract text from PDF
let text = self.extract_pdf_text(pdf_path)?;
// Make API call to GROQ
let result = self.call_groq_api(&text, prompt).await?;
Ok(result)
}
/// Extract text from PDF file
fn extract_pdf_text(&self, pdf_path: &str) -> Result<String> {
// let bytes = fs::read(pdf_path)?;
match pdf_extract::extract_text(&pdf_path) {
Ok(text) => Ok(text),
Err(e) => Err(anyhow!("Failed to extract PDF text: {}", e)),
}
}
/// Make API call to GROQ
async fn call_groq_api(&self, text: &str, prompt: &str) -> Result<Value> {
let client = reqwest::Client::new();
let url = "https://api.groq.com/openai/v1/chat/completions";
let request_body = json!({
"model": "meta-llama/llama-4-scout-17b-16e-instruct",
"response_format": {"type": "json_object"},
"messages": [
{
"role": "system",
"content": format!(
"You are a helpful assistant that extracts specific information from documents. The user wants you to: {}\n\nPlease return your response as valid JSON only, with no additional text or formatting.",
prompt
)
},
{
"role": "user",
"content": format!("Here is the document text:\n\n{}", text)
}
],
"temperature": 0.1,
"max_tokens": 2000
});
let response = client
.post(url)
.header("Content-Type", "application/json")
.header("Authorization", format!("Bearer {}", self.groq_api_key))
.json(&request_body)
.send()
.await?;
if response.status().is_success() {
let response_json: Value = response.json().await?;
// Extract content from response
if let Some(content) = response_json
.get("choices")
.and_then(|choices| choices.get(0))
.and_then(|choice| choice.get("message"))
.and_then(|message| message.get("content"))
.and_then(|content| content.as_str())
{
// Parse the content as JSON
match serde_json::from_str::<Value>(content) {
Ok(parsed_json) => Ok(parsed_json),
Err(e) => Ok(json!({
"error": format!("Failed to parse response: {}", e),
"raw_response": response_json
}))
}
} else {
Ok(json!({
"error": "Invalid response format",
"raw_response": response_json
}))
}
} else {
let status = response.status();
let error_text = response.text().await.unwrap_or_else(|_| "Unknown error".to_string());
Ok(json!({
"error": format!("API call failed: {}", status),
"message": error_text
}))
}
}
}
pub async fn extract_contact_info(api_key: String, pdf_file: &str, prompt: &str) -> Result<String> {
let extractor = PdfExtractor::new(api_key);
let result: Value = extractor.extract_pdf_to_json(pdf_file, prompt).await?;
let formatted = serde_json::to_string_pretty(&result)?;
Ok(formatted)
}
#[cfg(test)]
mod tests {
use super::*;
#[tokio::test]
async fn test_pdf_extractor_creation() {
let extractor = PdfExtractor::new("test_key".to_string());
assert_eq!(extractor.groq_api_key, "test_key");
}
#[test]
fn test_extract_pdf_text_nonexistent_file() {
let extractor = PdfExtractor::new("test_key".to_string());
let result = extractor.extract_pdf_text("nonexistent.pdf");
assert!(result.is_err());
}
}