"""
Bulk Transcription Analysis Tool
This script helps you:
1. Fetch multiple transcription results from the CAMB.AI API
2. Analyze the transcriptions (duration, speakers, content)
3. Export transcriptions to CSV files
4. Find common speakers and words across all transcriptions
Before running:
- Replace "your-api-key" with your actual API key
- Install required packages: pip install requests
"""
import requests
import csv
from collections import Counter
import re
# ============================================================================
# CONFIGURATION - Update these values before running
# ============================================================================
# Your API authentication details (sent with every request).
headers = {
    "x-api-key": "your-api-key",  # REPLACE WITH YOUR ACTUAL API KEY
    "Content-Type": "application/json",
}
# The API endpoint for fetching transcription results.
API_URL = "https://client.camb.ai/apis/transcription-results"
# ============================================================================
# MAIN FUNCTIONS
# ============================================================================
def get_bulk_transcriptions(run_ids):
    """
    Fetch multiple transcription results from the API in a single request.

    Sending the whole list of run IDs in one POST is much faster than
    making an individual request per run.

    Parameters:
        run_ids (list): Run IDs to fetch, e.g. [12345, 12346, 12347]

    Returns:
        dict: Maps each run ID to its transcription data, e.g.
            {"12345": {"transcript": [
                {"start": 0.0, "end": 2.5, "speaker": "Speaker_1",
                 "text": "Hello world"}]}}
            Returns None if the request fails for any reason.
    """
    payload = {"run_ids": run_ids}
    print(f"Requesting transcriptions for {len(run_ids)} runs...")
    try:
        # A timeout keeps the script from hanging forever on a stalled
        # connection (the original request had none).
        response = requests.post(
            API_URL,
            headers=headers,
            json=payload,
            timeout=30,
        )
        # Turn non-2xx status codes into exceptions handled below.
        response.raise_for_status()
        results = response.json()
        # Raw response shown for debugging; comment this out once working.
        print("Raw API response:")
        print(results)
        print()
        print(f"Successfully retrieved {len(results)} transcription results")
        return results
    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts, and HTTP error statuses.
        print(f"Error retrieving bulk transcriptions: {e}")
        if getattr(e, "response", None) is not None:
            print(f"Response details: {e.response.text}")
        return None
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
def format_time(seconds):
    """
    Render a duration in seconds as a zero-padded HH:MM:SS string.

    Parameters:
        seconds (float): Duration in seconds, e.g. 3661.5

    Returns:
        str: Formatted time, e.g. "01:01:01"
    """
    hrs = int(seconds // 3600)
    mins = int(seconds % 3600 // 60)
    secs = int(seconds % 60)
    # :02d pads each field to two digits ("01:05:30", not "1:5:30").
    return ":".join(f"{part:02d}" for part in (hrs, mins, secs))
def safe_get_transcript(transcription_data):
    """
    Pull the "transcript" segment list out of one run's API payload.

    Parameters:
        transcription_data (dict): Data for a single transcription run;
            may be None or empty.

    Returns:
        list: Transcript segments, or [] when the payload has none.
    """
    if transcription_data:
        return transcription_data.get("transcript", [])
    return []
# ============================================================================
# ANALYSIS FUNCTIONS
# ============================================================================
def analyze_single_transcription(transcription, run_id):
    """
    Print summary statistics for a single transcription.

    Shows the total audio duration, segment count, number of unique
    speakers, the most frequent speaker, and the first two dialogue
    segments as a sample.

    Parameters:
        transcription (list): Dialogue segments from the API; each is a
            dict expected to have "start", "end", "speaker", "text" keys.
        run_id (str): ID of this transcription run (display only).
    """
    if not transcription:
        print(f"Run {run_id}: No transcription data available")
        return
    print(f"Analyzing Run {run_id}:")
    print("=" * 40)
    # .get(..., 0) keeps one malformed segment from crashing the report.
    total_duration = max(segment.get("end", 0) for segment in transcription)
    total_segments = len(transcription)
    # One pass yields both the per-speaker counts and the unique set
    # (the original walked the list twice to build them separately).
    speaker_counts = Counter(
        segment["speaker"] for segment in transcription if "speaker" in segment
    )
    unique_speakers = len(speaker_counts)
    most_frequent_speaker = (
        speaker_counts.most_common(1)[0][0] if speaker_counts else "Unknown"
    )
    print(f"  Total duration: {format_time(total_duration)}")
    print(f"  Total segments: {total_segments}")
    print(f"  Unique speakers: {unique_speakers}")
    print(f"  Most frequent speaker: {most_frequent_speaker}")
    # Show the first two segments as sample dialogue.
    print("  Sample dialogue:")
    for segment in transcription[:2]:
        start_time = format_time(segment.get("start", 0))
        end_time = format_time(segment.get("end", 0))
        speaker = segment.get("speaker", "Unknown")
        text = segment.get("text", "")
        if len(text) > 50:  # truncate long lines to keep output readable
            text = text[:50] + "..."
        print(f"    [{start_time} - {end_time}] {speaker}: {text}")
    print()  # blank line between runs for readability
def analyze_bulk_transcriptions(transcription_results):
    """
    Run the per-run analysis over every transcription in the bulk result.

    Parameters:
        transcription_results (dict): All transcription results keyed by
            run ID, as returned by get_bulk_transcriptions().
    """
    if not transcription_results:
        print("No transcription results to analyze")
        return
    print(f"\nAnalyzing {len(transcription_results)} transcription results:")
    print("=" * 60)
    # Iterate items() directly instead of re-looking-up each run ID.
    for run_id, run_data in transcription_results.items():
        transcript = safe_get_transcript(run_data)
        analyze_single_transcription(transcript, run_id)
# ============================================================================
# EXPORT FUNCTIONS
# ============================================================================
def export_transcription_to_csv(transcription, filename):
    """
    Export one transcription to a CSV file for further analysis.

    Writes columns start, end, speaker, text — easy to open in Excel or
    any other spreadsheet program.

    Parameters:
        transcription (list): Dialogue segments to write.
        filename (str): Output file name, e.g. "transcription_12345.csv".
    """
    if not transcription:
        # Bug fix: the original printed the literal text "(unknown)"
        # instead of interpolating the actual file name.
        print(f"No data to export for {filename}")
        return
    try:
        with open(filename, "w", newline="", encoding="utf-8") as csvfile:
            writer = csv.DictWriter(
                csvfile, fieldnames=["start", "end", "speaker", "text"]
            )
            writer.writeheader()
            # One row per dialogue segment; missing keys become blanks.
            for segment in transcription:
                writer.writerow({
                    "start": segment.get("start", ""),
                    "end": segment.get("end", ""),
                    "speaker": segment.get("speaker", ""),
                    "text": segment.get("text", ""),
                })
        print(f"Exported transcription to {filename}")
    except Exception as e:
        # Deliberately broad best-effort: report the failure and let the
        # caller continue exporting the remaining runs.
        print(f"Error exporting to {filename}: {e}")
# ============================================================================
# CROSS-TRANSCRIPTION ANALYSIS FUNCTIONS
# ============================================================================
def extract_speakers_from_bulk(transcription_results):
    """
    Collect every unique speaker name appearing across all transcription runs.

    Parameters:
        transcription_results (dict): All transcription results keyed by run ID.

    Returns:
        list: Unique speaker names from every run (order unspecified —
            a set de-duplicates them internally).
    """
    # Set comprehension over all runs; (run_data or {}) tolerates a None
    # payload and the trailing "or []" a missing/empty transcript.
    return list({
        segment["speaker"]
        for run_data in transcription_results.values()
        for segment in ((run_data or {}).get("transcript") or [])
        if "speaker" in segment
    })
def find_common_words(transcription_results, min_length=5):
    """
    Count frequently used words across all transcriptions and print the top 10.

    Useful for spotting the main topics and themes in the audio content.

    Parameters:
        transcription_results (dict): All transcription results keyed by run ID.
        min_length (int): Minimum word length to keep (default: 5). This
            filters out common short words like "the" and "and".

    Returns:
        Counter: Frequency of every qualifying (lowercased) word.
    """
    word_counts = Counter()
    for run_data in transcription_results.values():
        # (run_data or {}) tolerates a None payload; "or []" a missing
        # or empty transcript.
        transcript = (run_data or {}).get("transcript") or []
        for segment in transcript:
            if "text" in segment:
                # Lowercase for consistent counting; \w+ strips punctuation.
                words = re.findall(r"\b\w+\b", segment["text"].lower())
                # Update the counter incrementally instead of building one
                # big intermediate word list.
                word_counts.update(w for w in words if len(w) >= min_length)
    print("\nMost common words across all transcriptions:")
    print("-" * 45)
    for word, count in word_counts.most_common(10):
        print(f"  {word}: {count} occurrences")
    return word_counts
# ============================================================================
# MAIN EXECUTION
# ============================================================================
if __name__ == "__main__":
    # Main execution — runs only when the script is executed directly.
    # Modify the run_ids list below with your actual run IDs.
    print("Starting Bulk Transcription Analysis Tool")
    print("=" * 50)
    # MODIFY THIS LIST WITH YOUR ACTUAL RUN IDs
    run_ids = [12345, 12346, 12347, 12348]
    print(f"Processing run IDs: {run_ids}")
    print()
    # Step 1: fetch all transcription results from the API.
    print("Step 1: Fetching transcription results...")
    bulk_results = get_bulk_transcriptions(run_ids)
    if bulk_results:
        # Bug fix: the original split this message across two source
        # lines, a syntax error for a single-quoted f-string.
        print("Successfully fetched results. Starting analysis...")
        # Step 2: analyze every transcription.
        print("\nStep 2: Analyzing transcriptions...")
        analyze_bulk_transcriptions(bulk_results)
        # Step 3: export each transcription to its own CSV file.
        print("\nStep 3: Exporting transcriptions to CSV files...")
        for run_id in bulk_results:
            transcript = safe_get_transcript(bulk_results.get(run_id, {}))
            if transcript:
                filename = f"transcription_run_{run_id}.csv"
                export_transcription_to_csv(transcript, filename)
        # Step 4: list all speakers seen across every run.
        print("\nStep 4: Finding all speakers...")
        all_speakers = extract_speakers_from_bulk(bulk_results)
        print(f"All speakers across {len(bulk_results)} runs: {all_speakers}")
        # Step 5: analyze common vocabulary.
        print("\nStep 5: Analyzing common vocabulary...")
        find_common_words(bulk_results)
        print("\nAnalysis complete! Check the CSV files for detailed transcription data.")
    else:
        print("Failed to fetch transcription results. Please check your API key and run IDs.")