#!/usr/bin/env python3
"""
title: "Biographical Researcher"
component_type: script
version: "1.0.0"
audience: contributor
status: stable
summary: "Biographical Researcher - Systematic biographical research automation"
keywords: ['api', 'automation', 'biographical', 'generation', 'researcher']
tokens: ~500
created: 2025-12-22
updated: 2025-12-22
script_name: "biographical_researcher.py"
language: python
executable: true
usage: "python3 scripts/biographical_researcher.py [options]"
python_version: "3.10+"
dependencies: []
modifies_files: false
network_access: false
requires_auth: false

Biographical Researcher - Systematic biographical research automation

Provides CLI and programmatic interface for executing biographical research
using structured search methodology with multi-source validation.

Usage:
    python3 biographical_researcher.py --names "Name1, Name2" --context "Company X"
    python3 biographical_researcher.py --config research_config.json
"""

import argparse
import json
import sys
from dataclasses import asdict, dataclass
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

class Thoroughness(Enum):
    """Research thoroughness levels"""

    BASIC = "basic"                  # 15-20 min per person
    STANDARD = "standard"            # 30-45 min per person
    COMPREHENSIVE = "comprehensive"  # 45-60 min per person

class ValidationLevel(Enum):
    """Source validation requirements"""

    RELAXED = "relaxed"    # 1+ source per claim
    STANDARD = "standard"  # 2+ sources per claim
    STRICT = "strict"      # 3+ sources per claim

class ConfidenceLevel(Enum):
    """Confidence in profile accuracy"""

    HIGH = "high"      # 90%+ confidence
    MEDIUM = "medium"  # 70-89% confidence
    LOW = "low"        # 50-69% confidence

@dataclass
class SearchQuery:
    """Structured search query"""

    phase: str           # research phase number this query belongs to ("1".."6")
    query_template: str  # template with {name}/{context}/{specialty}/... placeholders
    description: str     # human-readable purpose of the query

@dataclass
class ResearchPhase:
    """Research phase definition"""

    name: str
    duration_minutes: int
    queries: List[SearchQuery]  # queries executed during this phase
    deliverables: List[str]     # expected outputs of the phase

@dataclass
class ProfileSection:
    """Section of biographical profile"""

    name: str
    content: str
    sources: List[str]           # citations supporting this section
    confidence: ConfidenceLevel  # accuracy confidence for the section

@dataclass
class BiographicalProfile:
    """Complete biographical profile"""

    name: str
    role: Optional[str]
    affiliation: Optional[str]
    location: Optional[str]
    education: List[Dict[str, str]]                 # [{degree, field, institution, year}]
    professional_background: List[Dict[str, str]]   # [{company, role, years, achievements?}]
    # NOTE: fixed from Dict[str, any] — builtin any() is not a type annotation.
    research: Optional[Dict[str, Any]]              # {focus_areas, publications, ...}
    investment_portfolio: Optional[Dict[str, Any]]  # {focus, notable_investments, ...}
    public_presence: Dict[str, str]                 # platform name -> URL
    company_connection: Optional[Dict[str, str]]
    notable_achievements: List[str]
    sources: List[str]
    confidence: ConfidenceLevel
    completeness: float                             # 0.0-1.0 section-coverage score
    information_gaps: List[str]
    research_date: str                              # YYYY-MM-DD

    def to_dict(self):
        """Convert to dictionary for JSON serialization"""
        data = asdict(self)
        # Enum members are not JSON-serializable; store the string value.
        data['confidence'] = self.confidence.value
        return data

    def to_markdown(self) -> str:
        """Convert to markdown format"""
        md = []
        md.append(f"## {self.name}\n")

        if self.role:
            md.append(f"**Role:** {self.role}")
        if self.affiliation:
            md.append(f"**Affiliation:** {self.affiliation}")
        if self.location:
            md.append(f"**Location:** {self.location}\n")

        if self.education:
            md.append("### Education")
            for edu in self.education:
                degree = edu.get('degree', '')
                field = edu.get('field', '')
                institution = edu.get('institution', '')
                year = edu.get('year', '')
                md.append(f"- {degree} in {field} - {institution} ({year})")
            md.append("")

        if self.professional_background:
            md.append("### Professional Background")
            for job in self.professional_background:
                company = job.get('company', '')
                role = job.get('role', '')
                years = job.get('years', '')
                md.append(f"- **{company}** - {role} ({years})")
                if job.get('achievements'):
                    for achievement in job['achievements']:
                        md.append(f"  - {achievement}")
            md.append("")

        if self.research:
            md.append("### Research & Expertise")
            if self.research.get('focus_areas'):
                md.append(f"**Focus Areas:** {', '.join(self.research['focus_areas'])}\n")
            if self.research.get('publications'):
                md.append("**Notable Publications:**")
                # Cap at the five most relevant publications.
                for i, pub in enumerate(self.research['publications'][:5], 1):
                    md.append(f"{i}. **{pub.get('title', '')}** ({pub.get('year', '')})")
            md.append("")

        if self.investment_portfolio:
            md.append("### Investment Portfolio")
            if self.investment_portfolio.get('focus'):
                md.append(f"**Investment Focus:** {self.investment_portfolio['focus']}\n")
            if self.investment_portfolio.get('notable_investments'):
                md.append("**Notable Investments:**")
                for inv in self.investment_portfolio['notable_investments']:
                    md.append(f"- {inv}")
            md.append("")

        if self.public_presence:
            md.append("### Public Presence")
            for platform, url in self.public_presence.items():
                md.append(f"- **{platform.title()}:** {url}")
            md.append("")

        if self.notable_achievements:
            md.append("### Notable Achievements")
            for achievement in self.notable_achievements:
                md.append(f"- {achievement}")
            md.append("")

        if self.company_connection:
            md.append("### Company Connection")
            for key, value in self.company_connection.items():
                md.append(f"- **{key.replace('_', ' ').title()}:** {value}")
            md.append("")

        if self.sources:
            md.append("### Sources")
            for i, source in enumerate(self.sources, 1):
                md.append(f"{i}. {source}")
            md.append("")

        # Metadata
        md.append("---")
        md.append(f"**Research Date:** {self.research_date}")
        md.append(f"**Confidence:** {self.confidence.value.upper()}")
        md.append(f"**Completeness:** {self.completeness:.0%}")
        if self.information_gaps:
            md.append(f"**Information Gaps:** {', '.join(self.information_gaps)}")
        md.append("")

        return "\n".join(md)

class BiographicalResearcher:
    """
    Systematic biographical research system

    Executes structured research methodology with:
    - Multi-phase search strategy
    - Source validation
    - Profile compilation
    - Quality assessment
    """

    # Search query templates by phase
    SEARCH_QUERIES = {
        "initial_identification": [
            SearchQuery("1", "{name} {context}", "Primary identification"),
            SearchQuery("1", "{name} LinkedIn", "LinkedIn profile"),
            SearchQuery("1", "{name} Twitter OR {name} X.com", "Social media"),
            SearchQuery("1", "{name} {specialty}", "Specialty verification"),
        ],
        "educational_background": [
            SearchQuery("2", "{name} university", "University search"),
            SearchQuery("2", "{name} PhD OR {name} Master's OR {name} degree", "Degree search"),
            SearchQuery("2", "{name} {university}", "Specific university"),
            SearchQuery("2", "{name} thesis OR {name} dissertation", "Academic work"),
        ],
        "professional_experience": [
            SearchQuery("3", "{name} worked at", "Work history"),
            SearchQuery("3", "{name} CEO OR {name} CTO OR {name} founder", "Leadership roles"),
            SearchQuery("3", "{name} career history", "Career progression"),
            SearchQuery("3", "{name} resume OR {name} CV", "Resume search"),
        ],
        "research_publications": [
            SearchQuery("4", "{name} Google Scholar", "Google Scholar profile"),
            SearchQuery("4", "{name} papers", "Research papers"),
            SearchQuery("4", "{name} publications {field}", "Field-specific publications"),
            SearchQuery("4", "{name} citations", "Citation metrics"),
        ],
        "investment_history": [
            SearchQuery("5", "{name} investor", "Investor profile"),
            SearchQuery("5", "{name} portfolio", "Investment portfolio"),
            SearchQuery("5", "{name} Crunchbase", "Crunchbase profile"),
            SearchQuery("5", "{name} venture capital OR {name} angel investor", "Investment type"),
        ],
        "public_presence": [
            SearchQuery("6", "{name} interview", "Interviews"),
            SearchQuery("6", "{name} podcast", "Podcast appearances"),
            SearchQuery("6", "{name} blog OR {name} Substack", "Written content"),
            SearchQuery("6", "{name} speaking OR {name} conference", "Public speaking"),
        ],
    }

    def __init__(self,
                 thoroughness: Thoroughness = Thoroughness.STANDARD,
                 validation: ValidationLevel = ValidationLevel.STANDARD,
                 confidence_threshold: ConfidenceLevel = ConfidenceLevel.MEDIUM):
        """
        Initialize researcher

        Args:
            thoroughness: Research depth level
            validation: Source validation requirements
            confidence_threshold: Minimum confidence for profiles
        """
        self.thoroughness = thoroughness
        self.validation = validation
        self.confidence_threshold = confidence_threshold

    def generate_search_queries(self,
                                name: str,
                                context: str,
                                focus_areas: Optional[List[str]] = None) -> List[SearchQuery]:
        """
        Generate search queries for individual

        Args:
            name: Person's name
            context: Company/project context
            focus_areas: Specific areas to emphasize

        Returns:
            List of search queries to execute
        """
        queries = []

        # Always do initial identification
        for query in self.SEARCH_QUERIES["initial_identification"]:
            q = query.query_template.format(
                name=name,
                context=context,
                # First context word stands in for the specialty until research
                # narrows it down.
                specialty=context.split()[0] if context else ""
            )
            queries.append(SearchQuery(query.phase, q, query.description))

        # Educational background (all levels)
        if not focus_areas or "education" in focus_areas:
            for query in self.SEARCH_QUERIES["educational_background"]:
                q = query.query_template.format(
                    name=name,
                    university=""  # Will be filled in during research
                )
                queries.append(SearchQuery(query.phase, q, query.description))

        # Professional experience (all levels)
        if not focus_areas or "professional" in focus_areas or "background" in focus_areas:
            for query in self.SEARCH_QUERIES["professional_experience"]:
                q = query.query_template.format(name=name)
                queries.append(SearchQuery(query.phase, q, query.description))

        # Research (if comprehensive or specified)
        if ((self.thoroughness == Thoroughness.COMPREHENSIVE or
                (focus_areas and "publications" in focus_areas)) and
                (not focus_areas or "research" in focus_areas)):
            for query in self.SEARCH_QUERIES["research_publications"]:
                q = query.query_template.format(
                    name=name,
                    field=""  # Will be filled in
                )
                queries.append(SearchQuery(query.phase, q, query.description))

        # Investments (if investor or specified)
        if focus_areas and ("investments" in focus_areas or "portfolio" in focus_areas):
            for query in self.SEARCH_QUERIES["investment_history"]:
                q = query.query_template.format(name=name)
                queries.append(SearchQuery(query.phase, q, query.description))

        # Public presence (standard and comprehensive)
        if self.thoroughness in [Thoroughness.STANDARD, Thoroughness.COMPREHENSIVE]:
            for query in self.SEARCH_QUERIES["public_presence"]:
                q = query.query_template.format(name=name)
                queries.append(SearchQuery(query.phase, q, query.description))

        return queries

    def research_individual(self,
                            name: str,
                            context: str,
                            focus_areas: Optional[List[str]] = None) -> BiographicalProfile:
        """
        Research single individual

        Args:
            name: Person's name
            context: Company/project context
            focus_areas: Specific areas to emphasize

        Returns:
            Complete biographical profile

        Note:
            This method generates search queries but does NOT execute web searches.
            Web searches must be executed by calling agent (biographical-researcher)
            which has access to WebSearch tool.

            This script provides:
            1. Search query generation
            2. Profile structure
            3. Validation framework
            4. Output formatting
        """
        queries = self.generate_search_queries(name, context, focus_areas)

        # Return profile structure with queries
        # Actual research executed by agent with WebSearch capability
        profile = BiographicalProfile(
            name=name,
            role=None,
            affiliation=context,
            location=None,
            education=[],
            professional_background=[],
            research=None,
            investment_portfolio=None,
            public_presence={},
            company_connection={"context": context},
            notable_achievements=[],
            sources=[],
            confidence=ConfidenceLevel.MEDIUM,
            completeness=0.0,
            information_gaps=[],
            research_date=datetime.now().strftime("%Y-%m-%d")
        )

        return profile

    def validate_profile(self, profile: BiographicalProfile) -> Tuple[bool, List[str]]:
        """
        Validate profile quality

        Args:
            profile: Profile to validate

        Returns:
            (is_valid, list of validation issues)
        """
        issues = []

        # Check confidence threshold (ordinal comparison via numeric rank)
        confidence_levels = {
            ConfidenceLevel.HIGH: 3,
            ConfidenceLevel.MEDIUM: 2,
            ConfidenceLevel.LOW: 1
        }
        if confidence_levels[profile.confidence] < confidence_levels[self.confidence_threshold]:
            issues.append(f"Confidence {profile.confidence.value} below threshold {self.confidence_threshold.value}")

        # Check completeness
        if profile.completeness < 0.6:
            issues.append(f"Completeness {profile.completeness:.0%} below minimum 60%")

        # Check source count
        min_sources = {
            ValidationLevel.RELAXED: 1,
            ValidationLevel.STANDARD: 2,
            ValidationLevel.STRICT: 3
        }
        if len(profile.sources) < min_sources[self.validation] * 2:  # * 2 for overall profile
            issues.append(f"Source count {len(profile.sources)} below minimum {min_sources[self.validation] * 2}")

        # Check required sections
        if not profile.education:
            issues.append("Missing education section")
        if not profile.professional_background:
            issues.append("Missing professional background")

        return len(issues) == 0, issues

    def calculate_completeness(self, profile: BiographicalProfile) -> float:
        """
        Calculate profile completeness score

        Args:
            profile: Profile to assess

        Returns:
            Completeness score (0.0-1.0)
        """
        sections = [
            bool(profile.role),
            bool(profile.affiliation),
            bool(profile.location),
            bool(profile.education),
            bool(profile.professional_background),
            bool(profile.public_presence),
            bool(profile.sources and len(profile.sources) >= 3),
        ]

        # Optional sections (don't penalize if missing)
        if profile.research is not None:
            sections.append(True)
        if profile.investment_portfolio is not None:
            sections.append(True)

        return sum(sections) / len(sections)

def generate_search_strategy_doc(names: List[str], context: str, output: Path):
    """
    Generate search strategy document

    Args:
        names: List of names to research
        context: Research context
        output: Output file path
    """
    doc = []
    doc.append("# Biographical Research Strategy\n")
    doc.append(f"**Context:** {context}")
    doc.append(f"**Date:** {datetime.now().strftime('%Y-%m-%d')}")
    doc.append(f"**Individuals:** {len(names)}\n")
    doc.append("---\n")

    researcher = BiographicalResearcher()

    for name in names:
        doc.append(f"## {name}\n")
        queries = researcher.generate_search_queries(name, context)

        # Group by phase
        phases = {}
        for query in queries:
            if query.phase not in phases:
                phases[query.phase] = []
            phases[query.phase].append(query)

        for phase_num in sorted(phases.keys()):
            doc.append(f"### Phase {phase_num}\n")
            for query in phases[phase_num]:
                doc.append(f"**{query.description}:**")
                doc.append("```")
                doc.append(query.query_template)
                doc.append("```\n")

        doc.append("---\n")

    output.write_text("\n".join(doc))
    print(f"Search strategy written to {output}")

def main():
    """Main CLI entry point"""
    parser = argparse.ArgumentParser(
        description="Biographical Research Automation",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Generate search strategy
  python3 biographical_researcher.py --names "Name1, Name2" --context "Company X" --strategy-only

  # Research team (requires agent invocation for web search)
  python3 biographical_researcher.py --names "Name1, Name2" --context "Company X" --output team.md
"""
    )

    # Input options
    parser.add_argument("--names", required=True, help="Comma-separated list of names")
    parser.add_argument("--context", required=True, help="Company/project context")
    parser.add_argument("--focus", help="Comma-separated focus areas")

    # Research options
    parser.add_argument("--thoroughness", choices=["basic", "standard", "comprehensive"],
                        default="standard", help="Research thoroughness level")
    parser.add_argument("--validate", choices=["relaxed", "standard", "strict"],
                        default="standard", help="Validation level")
    parser.add_argument("--confidence-threshold", choices=["low", "medium", "high"],
                        default="medium", help="Minimum confidence threshold")

    # Output options
    parser.add_argument("--output", "-o", help="Output file path")
    parser.add_argument("--format", choices=["markdown", "json"], default="markdown",
                        help="Output format")
    parser.add_argument("--strategy-only", action="store_true",
                        help="Generate search strategy document only (no web research)")

    args = parser.parse_args()

    # Parse inputs
    names = [name.strip() for name in args.names.split(",")]
    focus_areas = [area.strip() for area in args.focus.split(",")] if args.focus else None

    # Initialize researcher
    researcher = BiographicalResearcher(
        thoroughness=Thoroughness(args.thoroughness),
        validation=ValidationLevel(args.validate),
        confidence_threshold=ConfidenceLevel(args.confidence_threshold)
    )

    # Determine output path
    if args.output:
        output_path = Path(args.output)
    else:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        ext = ".md" if args.format == "markdown" else ".json"
        output_path = Path(f"biographical_research_{timestamp}{ext}")

    # Strategy-only mode
    if args.strategy_only:
        generate_search_strategy_doc(names, args.context, output_path)
        return

    # Full research mode
    print("⚠️ NOTE: This script generates search queries but does NOT execute web searches.")
    print(" Web research requires agent invocation with WebSearch capability.")
    print()
    print(" To execute full research, use:")
    print(f" /research-bio --names \"{args.names}\" --context \"{args.context}\"")
    print()
    print(" This script provides:")
    print(" - Search query generation ✓")
    print(" - Profile structure ✓")
    print(" - Validation framework ✓")
    print(" - Output formatting ✓")
    print()

    # Generate profiles (structure only, no data)
    profiles = []
    for name in names:
        profile = researcher.research_individual(name, args.context, focus_areas)
        profiles.append(profile)

    # Output
    if args.format == "json":
        output_data = {
            "research_date": datetime.now().strftime("%Y-%m-%d"),
            "context": args.context,
            "thoroughness": args.thoroughness,
            "validation_level": args.validate,
            "profiles": [p.to_dict() for p in profiles]
        }
        output_path.write_text(json.dumps(output_data, indent=2))
    else:
        md_content = []
        md_content.append(f"# {args.context} - Biographical Profiles\n")
        md_content.append(f"**Research Date:** {datetime.now().strftime('%Y-%m-%d')}")
        md_content.append(f"**Thoroughness:** {args.thoroughness}")
        md_content.append(f"**Validation:** {args.validate}\n")
        md_content.append("---\n")

        for profile in profiles:
            md_content.append(profile.to_markdown())
            md_content.append("---\n")

        output_path.write_text("\n".join(md_content))

    print(f"✓ Profile structure written to {output_path}")
    print(f"✓ Generated search queries for {len(names)} individuals")
    print()
    print("Next step: Execute research via agent:")
    print(f"/research-bio --names \"{args.names}\" --context \"{args.context}\" --output {output_path}")

# BUG FIX: the guard compared an undefined `name` to "main" (dunder underscores
# were stripped); the canonical entry-point guard is restored below.
if __name__ == "__main__":
    main()