Skip to main content

Scenario

How accurate are Mavera’s synthetic personas compared to real humans? This job asks the same set of questions in both Typeform (for real respondents) and a Mavera Focus Group (for synthetic personas), then compares the results. The script below assumes the Typeform form has already been deployed with these questions; it creates only the Focus Group. The output is calibration data: where synthetic responses match reality (and where they diverge), so you know which research questions you can confidently delegate to Mavera and which still need human validation. Flow: Design questions → Deploy as Typeform survey + Mavera Focus Group → Collect real + synthetic responses → Compare themes, sentiment, rankings → Calibration report

Architecture

Code

import os, json, requests, time
from collections import Counter

# Service base URLs.
TF_BASE = "https://api.typeform.com"
MB = "https://app.mavera.io/api/v1"

# API credentials come from the environment; a missing variable raises
# KeyError here, before any request is made.
TF = os.environ["TYPEFORM_TOKEN"]
MV = os.environ["MAVERA_API_KEY"]

# Both services take a bearer token; Mavera requests also declare a JSON body.
TF_H = {"Authorization": "Bearer " + TF}
MV_H = {"Authorization": "Bearer " + MV, "Content-Type": "application/json"}

# ID of the Typeform form to read responses from; override with the
# CALIBRATION_FORM_ID environment variable.
TYPEFORM_FORM_ID = os.environ.get("CALIBRATION_FORM_ID", "your_form_id")

# Comma-separated Mavera persona IDs from the PERSONA_IDS environment variable.
# Whitespace and empty entries are dropped, so an unset variable yields []
# (not [""]) and "a, b," parses to ["a", "b"].
PERSONA_IDS = [
    pid.strip()
    for pid in os.environ.get("PERSONA_IDS", "").split(",")
    if pid.strip()
]

# Questions put to both audiences. Later steps match Typeform field titles and
# focus-group responses against the first 30 characters (case-insensitive) of
# each question, so those prefixes must stay distinct.
SHARED_QUESTIONS = [
    "What is the biggest challenge you face in your role today?",
    "How would you describe our product to a colleague in one sentence?",
    "What nearly stopped you from becoming a customer?",
    "Rate your onboarding experience (1-10) and explain why.",
    "If you could change one thing about our product, what would it be?",
]

# 1. Pull Typeform responses (assumes form already deployed with these questions)
form_resp = requests.get(f"{TF_BASE}/forms/{TYPEFORM_FORM_ID}", headers=TF_H,
    timeout=30)
# Fail loudly on a bad token or form ID; otherwise the error body would parse
# as a form with no fields and every later step would run on empty data.
form_resp.raise_for_status()
form = form_resp.json()

# Map Typeform field ID -> shared question. A field matches the first shared
# question whose 30-character prefix (case-insensitive) occurs in its title.
field_map = {}
for f in form.get("fields", []):
    title = f.get("title", "").lower()
    for q in SHARED_QUESTIONS:
        if q.lower()[:30] in title:
            field_map[f["id"]] = q
            break

# Fetch every response one page at a time. Each follow-up request passes the
# token of the last item received as the `before` cursor.
PAGE_SIZE = 1000
MAX_RATE_LIMIT_RETRIES = 5

responses_raw = []
params = {"page_size": PAGE_SIZE}
rate_limit_retries = 0
while True:
    r = requests.get(f"{TF_BASE}/forms/{TYPEFORM_FORM_ID}/responses",
        headers=TF_H, params=params, timeout=30)
    if r.status_code == 429 and rate_limit_retries < MAX_RATE_LIMIT_RETRIES:
        # Back off exponentially (1s, 2s, 4s, ...) instead of retrying forever.
        time.sleep(2 ** rate_limit_retries)
        rate_limit_retries += 1
        continue
    # Raises for any error status, including a 429 once retries are exhausted.
    r.raise_for_status()
    rate_limit_retries = 0
    data = r.json()
    items = data.get("items", [])
    responses_raw.extend(items)
    # A short page is treated as the last page.
    if len(items) < PAGE_SIZE:
        break
    params["before"] = items[-1]["token"]
    # Brief pause between pages to stay clear of the rate limit.
    time.sleep(0.6)

# Bucket every real answer under the shared question its field maps to.
real_answers = dict((question, []) for question in SHARED_QUESTIONS)
for submission in responses_raw:
    for answer in submission.get("answers", []):
        field_id = answer.get("field", {}).get("id", "")
        if field_id not in field_map:
            # Field does not belong to any shared question.
            continue
        # Prefer the free-text value; fall back to the numeric value as text.
        raw = answer.get("text", "")
        if not raw:
            raw = str(answer.get("number", ""))
        cleaned = raw.strip()
        if cleaned:
            real_answers[field_map[field_id]].append(cleaned)

# Summary: total submissions, then the answer count per question.
print(f"Real responses: {len(responses_raw)}")
for question, collected in real_answers.items():
    print(f"  {question[:50]}... → {len(collected)} answers")

# 2. Run Mavera Focus Group with same questions
# Drop blank or whitespace-only IDs (an unset PERSONA_IDS variable splits to [""]).
PERSONA_IDS = [pid.strip() for pid in PERSONA_IDS if pid.strip()]
if not PERSONA_IDS:
    # No personas supplied: create a generic one so the study can still run.
    persona_resp = requests.post(f"{MB}/personas", headers=MV_H, json={
        "name": "Calibration Persona",
        "description": "General target customer for A/B calibration study.",
    }, timeout=30)
    # Surface API errors here rather than as a KeyError on "id" below.
    persona_resp.raise_for_status()
    p = persona_resp.json()
    PERSONA_IDS = [p["id"]]

# Create the focus group; its "id" is needed later to poll for results.
fg_resp = requests.post(f"{MB}/focus-groups", headers=MV_H, json={
    "name": "Calibration: Typeform vs Mavera",
    "persona_ids": PERSONA_IDS,
    "questions": SHARED_QUESTIONS,
    "context": "Answer these questions as you genuinely would. Be specific and honest.",
    "responses_per_persona": 5,
}, timeout=30)
fg_resp.raise_for_status()
fg = fg_resp.json()

# Poll until the focus group reports status "completed": up to POLL_ATTEMPTS
# checks, POLL_INTERVAL_S seconds apart (about 100 seconds in total).
POLL_ATTEMPTS = 20
POLL_INTERVAL_S = 5

for _ in range(POLL_ATTEMPTS):
    time.sleep(POLL_INTERVAL_S)
    poll_resp = requests.get(f"{MB}/focus-groups/{fg['id']}",
        headers=MV_H, timeout=30)
    poll_resp.raise_for_status()
    fg_result = poll_resp.json()
    if fg_result.get("status") == "completed":
        break
else:
    # The loop ran out without a break: stop rather than build a calibration
    # report from an unfinished focus group.
    raise TimeoutError(
        f"Focus group {fg['id']} did not complete after {POLL_ATTEMPTS} polls "
        f"(last status: {fg_result.get('status')!r})"
    )

# Bucket every synthetic answer under the first shared question whose
# 30-character prefix (case-insensitive) occurs in the response's question text.
synthetic_answers = dict((question, []) for question in SHARED_QUESTIONS)
for entry in fg_result.get("responses", []):
    asked = entry.get("question", "").lower()
    matched = next(
        (question for question in SHARED_QUESTIONS
         if question.lower()[:30] in asked),
        None,
    )
    if matched is not None:
        synthetic_answers[matched].append(entry.get("answer", ""))

# 3. Compare with Mave
# Build one prompt holding, per question, the total answer count and a short
# sample from each audience. Both counts are true totals so they are comparable.
REAL_SAMPLE_SIZE = 5       # real answers quoted per question
SYNTHETIC_SAMPLE_SIZE = 3  # synthetic answers quoted per question
SNIPPET_CHARS = 100        # each quoted answer is cut to this many characters

prompt_parts = ["Compare real human survey responses vs synthetic persona responses.\n\n"]
for q in SHARED_QUESTIONS:
    real = real_answers.get(q, [])
    synth = synthetic_answers.get(q, [])
    real_sample = "; ".join(r[:SNIPPET_CHARS] for r in real[:REAL_SAMPLE_SIZE])
    synth_sample = "; ".join(s[:SNIPPET_CHARS] for s in synth[:SYNTHETIC_SAMPLE_SIZE])
    prompt_parts.append(f"\n**Question:** {q}\n")
    prompt_parts.append(f"REAL ({len(real)} total): {real_sample}\n")
    prompt_parts.append(f"SYNTHETIC ({len(synth)} total): {synth_sample}\n")

prompt_parts.append("""

Analyze:
1) Theme overlap: What % of real themes appear in synthetic responses?
2) Sentiment alignment: Do synthetic responses match the emotional tone?
3) Specificity gap: Are real responses more/less specific?
4) Blind spots: What did real humans say that synthetics completely missed?
5) Synthetic advantages: Where did synthetics provide insight humans didn't?
6) Calibration score (0-100): How reliable is synthetic for this question set?
7) Recommendations: Which question types are safe to delegate to synthetic?""")

comparison_prompt = "".join(prompt_parts)

# Send the prompt to the Mave chat endpoint. The timeout is generous because
# the reply is a generated analysis.
mave_resp = requests.post(f"{MB}/mave/chat", headers=MV_H,
    json={"message": comparison_prompt}, timeout=120)
# Raise on an HTTP error instead of printing an empty report.
mave_resp.raise_for_status()
comparison = mave_resp.json()

print(f"\n{'='*60}")
print("CALIBRATION REPORT: Typeform (Real) vs Mavera (Synthetic)")
print(f"{'='*60}")
# `or ""` covers a missing or null "content" field; output is capped at 3000 chars.
print((comparison.get("content") or "")[:3000])

Example Output

============================================================
CALIBRATION REPORT: Typeform (Real) vs Mavera (Synthetic)
============================================================