\documentclass[9pt]{beamer}
\usetheme{Madrid}
\usepackage[backend=biber, style=authoryear]{biblatex}

\addbibresource{ai.bib} % Replace with your .bib file

\title{Measuring AI/LLM Capabilities and Progress}
\author{Your Name}
\date{June 2025}

\begin{document}

\begin{frame}
\titlepage
\end{frame}

% --- Slide: Motivation ---
\begin{frame}{Why Measure AI/LLM Capabilities?}
\begin{itemize}
\item \textbf{Rapid progress} in Large Language Models (LLMs) and AI systems.
\item \textbf{Need for robust, transparent benchmarks} to track abilities and risks.
\item \textbf{Inform policy, safety, and deployment decisions.}
\item \textbf{Ensure accountability and trust} in AI systems used by the public and organizations.
\item \textbf{Identify and mitigate biases and harmful behaviors} that may emerge in AI outputs.
\item \textbf{Guide investment and research priorities} by highlighting strengths and weaknesses.
\item \textbf{Support ethical AI development} through measurable standards and continuous improvement.
\item \textbf{Facilitate fair comparisons} between different models and approaches.
\end{itemize}
\end{frame}

% --- Slide: Frameworks for Measurement (Extended) ---
\begin{frame}{Frameworks for Measuring Progress}
\begin{itemize}
\item \textbf{Task Completion Benchmarks:} Assess ability to perform complex, multi-step tasks; widely used for model comparison and validation.
\item \textbf{Leaderboards:} Public, standardized comparisons across models; encourage transparency and drive innovation.
\item \textbf{Prediction Markets:} Aggregate expert and crowd forecasts on AI milestones; leverage collective intelligence for progress tracking.
\item \textbf{International Reports:} Comprehensive, cross-institutional tracking of AI progress and risks; provide global perspective and policy guidance.
\item \textbf{Standardized Evaluation Datasets:} Curated datasets for reproducible testing; ensure fair and consistent model assessment.
\item \textbf{User Feedback and Real-World Deployment:} Collect empirical data from real applications; highlight practical performance and user satisfaction.
\item \textbf{Longitudinal Studies:} Track AI system improvements over time; identify trends and inflection points in capability growth.
\item \textbf{Expert Panels and Peer Reviews:} Structured assessments by domain specialists; offer nuanced insights beyond automated metrics.
\end{itemize}
\end{frame}

% --- Slide: Task Completion Benchmarks ---
\begin{frame}[allowframebreaks]{Task Completion Benchmarks}
\begin{itemize}
\item \textbf{Long-Horizon Task Evaluation:} METR's research measures LLMs' ability to complete extended, multi-step tasks, highlighting current limitations in reliability and autonomy \parencite{metr2025}. Recent results show that frontier models can autonomously complete tasks with a time horizon of about 40 minutes, but not yet work requiring days or weeks \parencite{metr2025,nature2025}.
\item \textbf{PowerPoint Task Completion (PPTC):} Evaluates LLMs on multi-turn, multi-modal instructions within PowerPoint, revealing challenges in tool use, error accumulation, and multi-modality; GPT-4 leads, but all models struggle with non-text operations and long sessions \parencite{acl2024}.
\item \textbf{Long-Horizon Vision-Language Navigation (LHPR-VLN):} Benchmarks LLMs and specialized agents on multi-stage, complex navigation tasks. Most models fail even on short subtasks; only fine-tuned or specialized agents (e.g., MGDM) show partial success, underscoring the importance of memory and holistic understanding in long tasks \parencite{arxiv-lhpr-vln}.
\item \textbf{Other Environments:}
\begin{itemize}
\item Robotics: SayCan uses LLMs to generate action sequences for robots.
\item Web navigation: WebShop assesses LLMs in e-commerce scenarios.
\item Agent-based tasks: AgentBench evaluates LLMs as autonomous agents across 8 diverse environments.
\end{itemize}
\item \textbf{Key Insights:}
\begin{itemize}
\item Current LLMs excel at short, well-defined tasks but suffer from reliability problems and error accumulation in long or complex workflows.
\item Progress is exponential: time horizons for task completion have been doubling every 7 months \parencite{metr2025}; see the extrapolation below.
\item Economic impact remains limited by current task horizons and by challenges in multi-modality and error handling.
\end{itemize}
\end{itemize}
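\medskip
{\footnotesize A back-of-envelope extrapolation of the METR trend (illustrative only; it assumes the reported doubling rate simply continues): with today's horizon $H_0 \approx 40$ minutes and doubling time $T \approx 7$ months,
\[
H(t) \approx H_0 \cdot 2^{\,t/T}, \qquad \mbox{e.g. } H(21\ \mbox{months}) \approx 40 \cdot 2^{3} = 320 \mbox{ minutes}.
\]}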
\end{frame}

% --- Slide: Leaderboards ---
\begin{frame}{Leaderboards}
\begin{itemize}
\item \textbf{Purpose:}
\begin{itemize}
\item Track and compare LLM performance across benchmarks.
\item Provide a standardized way to evaluate model capabilities.
\end{itemize}
\item \textbf{Examples:}
\begin{itemize}
\item Gorilla APIBench Leaderboard \parencite{gorilla}
\item Aider.chat Leaderboards \parencite{aider}
\item Hugging Face Open LLM Leaderboard
\item LMSYS Chatbot Arena (ranked from pairwise human votes; see the rating sketch below)
\end{itemize}
\item \textbf{Benefits:}
\begin{itemize}
\item Promote transparency and reproducibility in AI evaluation.
\item Encourage healthy competition and rapid innovation.
\item Help researchers and practitioners identify state-of-the-art models.
\end{itemize}
\item \textbf{Considerations:}
\begin{itemize}
\item Leaderboards may not capture all real-world use cases.
\item Benchmarks must be kept robust, diverse, and up-to-date.
\item Risk of overfitting to specific leaderboard metrics.
\end{itemize}
\end{itemize}
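\medskip
{\footnotesize Arena-style leaderboards turn pairwise human votes into ratings via an Elo/Bradley--Terry scheme. A minimal sketch (the update constant $K$ is illustrative, not Chatbot Arena's exact pipeline):
\[
E_A = \frac{1}{1 + 10^{(R_B - R_A)/400}}, \qquad R_A \leftarrow R_A + K\,(S_A - E_A),
\]
where $S_A \in \{0, 1/2, 1\}$ records a loss, tie, or win for model $A$.}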
\end{frame}

% --- Slide: Prediction Markets ---
\begin{frame}{Prediction Markets for AI Progress}
\begin{itemize}
\item Platforms like Metaculus and Polymarket aggregate forecasts on AI milestones \parencite{metaculus,polymarket}; market prices can be read as probabilities (see below).
\item Useful for synthesizing expert and crowd expectations about future capabilities.
\item Complement empirical benchmarks with probabilistic insights.
\item Recent surge in activity: major platforms have seen notable increases in trading volume and engagement, especially around high-profile events and technological milestones \parencite{polymarket,metaculus}.
\item Metaculus specializes in long-term, technology-focused questions, enabling nuanced tracking of progress in areas like quantum computing and advanced AI systems \parencite{metaculus}.
\item Polymarket and PredictIt demonstrate how prediction markets can reflect real-time shifts in collective expectations, sometimes diverging from traditional expert consensus \parencite{polymarket,predictit}.
\item AI-powered information aggregation is enhancing prediction markets, allowing finer-grained, real-time analysis and more targeted event creation \parencite{hackernoon}.
\item Prediction markets can help identify emerging trends, inform policy, and guide strategic investments in AI by revealing where consensus and uncertainty lie.
\end{itemize}
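\medskip
{\footnotesize A binary contract paying \$1 if an event occurs trades at a price $\pi \in (0,1)$ that is read as the market's probability estimate (a \$0.70 price implies $P \approx 0.7$). One textbook automated market maker, Hanson's LMSR, prices outcome $i$ from the outstanding share vector $q$ as
\[
p_i = \frac{e^{q_i/b}}{\sum_j e^{q_j/b}},
\]
with liquidity parameter $b$; actual platforms may instead use order books or other mechanisms.}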
\end{frame}

% --- Slide: International Tracking and Reports ---
\begin{frame}{International Tracking and Reports}
\begin{itemize}
\item \textbf{International AI Safety Report:}
\begin{itemize}
\item Annual, multi-stakeholder assessment of AI progress, risks, and governance \parencite{aisafety}.
\item Includes expert insights from academia, industry, and civil society.
\end{itemize}
\item \textbf{TrackingAI.org:}
\begin{itemize}
\item Centralized resource for tracking AI system capabilities and benchmarks \parencite{trackingai}.
\item Features interactive dashboards and regular updates.
\end{itemize}
\item \textbf{Emerging Initiatives:}
\begin{itemize}
\item Regional and international AI observatories (e.g., the EU AI Observatory).
\item Collaborative databases for sharing best practices and incident reports.
\end{itemize}
\item \textbf{Key Benefits:}
\begin{itemize}
\item Facilitate global coordination and evidence-based policy.
\item Increase transparency and accountability in AI development.
\item Support proactive risk management and regulatory adaptation.
\end{itemize}
\end{itemize}
\end{frame}

% --- Slide: Challenges and Limitations ---
\begin{frame}[allowframebreaks]{Challenges in Measuring AI Capabilities}
\begin{itemize}
\item \textbf{Multi-modality and real-world complexity remain difficult to benchmark} \parencite{acl2024}.
\begin{itemize}
\item Integrating text, images, audio, and video introduces interdependencies that are hard to isolate and measure.
\item Real-world scenarios often involve ambiguous or incomplete information, making standardized evaluation challenging.
\end{itemize}
\item \textbf{Error accumulation in long-horizon tasks.}
\begin{itemize}
\item As AI systems perform longer sequences of actions or reasoning steps, small errors compound into significant inaccuracies (see the worked example below).
\item This makes it difficult to assess reliability over extended interactions or complex workflows.
\end{itemize}
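\medskip
{\footnotesize A worked example under a simplifying independence assumption (each step succeeds with probability $p$, independently; real agents are not this clean):
\[
P(\mbox{all } n \mbox{ steps succeed}) = p^{\,n}, \qquad 0.99^{100} \approx 0.37,
\]
so a 1\% per-step error rate already sinks most 100-step runs.}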
\item \textbf{Subjective tasks (e.g., aesthetics) are hard to evaluate automatically.}
\begin{itemize}
\item Human judgment is often required for tasks involving creativity, style, or subjective quality.
\item Automated metrics may fail to capture nuances that are obvious to humans; rater agreement itself must then be measured (see below).
\end{itemize}
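\medskip
{\footnotesize When human raters stand in for automated metrics, their reliability is itself quantified; one common chance-corrected agreement statistic is Cohen's kappa:
\[
\kappa = \frac{p_o - p_e}{1 - p_e},
\]
where $p_o$ is the observed agreement between raters and $p_e$ the agreement expected by chance.}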
\item \textbf{Need for continual updates as models and tasks evolve.}
\begin{itemize}
\item Benchmarks quickly become outdated (saturated, or leaked into training data) as new models and capabilities emerge.
\item Evaluation frameworks must be continuously adapted to keep pace with technological progress.
\end{itemize}
\item \textbf{Generalization across domains remains a key challenge.}
\begin{itemize}
\item Models often perform well on specific benchmarks but struggle to generalize to unseen or novel situations.
\item Ensuring robustness and adaptability in diverse environments is an ongoing research problem.
\end{itemize}
\end{itemize}
\end{frame}

% --- Slide: Conclusion ---
\begin{frame}{Conclusion}
\begin{itemize}
\item Measuring AI/LLM capabilities is essential for safe and effective deployment.
\item A combination of benchmarks, leaderboards, prediction markets, and international reports provides a holistic view.
\item Ongoing research is needed to address emerging challenges and ensure robust evaluation.
\item Collaboration among academia, industry, and policymakers is crucial for advancing evaluation methods.
\item Transparency in AI assessment processes builds public trust and supports informed decision-making.
\item Future directions should consider ethical implications and societal impact alongside technical performance.
\end{itemize}
\end{frame}

% --- Optional: Thank You Slide ---
\begin{frame}{Thank You}
\begin{center}
Thank you for your attention! Questions?
\end{center}
\end{frame}

% --- Slide: References ---
\begin{frame}[allowframebreaks]{References}
\printbibliography
\end{frame}

\end{document}