\documentclass[9pt]{beamer}
\usetheme{Madrid}
\usepackage[backend=biber, style=authoryear]{biblatex}

\addbibresource{ai.bib} % Replace with your .bib file

\title{Measuring AI/LLM Capabilities and Progress}
\author{Your Name}
\date{June 2025}

\begin{document}

\begin{frame}
\titlepage
\end{frame}

% --- Slide: Motivation ---
\begin{frame}{Why Measure AI/LLM Capabilities?}
\begin{itemize}
\item \textbf{Rapid progress} in Large Language Models (LLMs) and AI systems.
\item \textbf{Need for robust, transparent benchmarks} to track abilities and risks.
\item \textbf{Inform policy, safety, and deployment decisions.}
\item \textbf{Ensure accountability and trust} in AI systems used by the public and organizations.
\item \textbf{Identify and mitigate biases and harmful behaviors} that may emerge in AI outputs.
\item \textbf{Guide investment and research priorities} by highlighting strengths and weaknesses.
\item \textbf{Support ethical AI development} through measurable standards and continuous improvement.
\item \textbf{Facilitate fair comparisons} between different models and approaches.
\end{itemize}
\end{frame}

% --- Slide: Frameworks for Measurement (Extended) ---
\begin{frame}{Frameworks for Measuring Progress}
\begin{itemize}
\item \textbf{Task Completion Benchmarks:} Assess ability to perform complex, multi-step tasks; widely used for model comparison and validation.
\item \textbf{Leaderboards:} Public, standardized comparisons across models; encourage transparency and drive innovation.
\item \textbf{Prediction Markets:} Aggregate expert and crowd forecasts on AI milestones; leverage collective intelligence for progress tracking.
\item \textbf{International Reports:} Comprehensive, cross-institutional tracking of AI progress and risks; provide global perspective and policy guidance.
\item \textbf{Standardized Evaluation Datasets:} Curated datasets for reproducible testing; ensure fair and consistent model assessment.
\item \textbf{User Feedback and Real-World Deployment:} Collect empirical data from real applications; highlight practical performance and user satisfaction.
\item \textbf{Longitudinal Studies:} Track AI system improvements over time; identify trends and inflection points in capability growth.
\item \textbf{Expert Panels and Peer Reviews:} Structured assessments by domain specialists; offer nuanced insights beyond automated metrics.
\end{itemize}
\end{frame}

% --- Slide: Task Completion Benchmarks ---
\begin{frame}[allowframebreaks]{Task Completion Benchmarks}
\begin{itemize}
\item \textbf{Long-Horizon Task Evaluation:} METR's research measures LLMs' ability to complete extended, multi-step tasks, highlighting current limitations in reliability and autonomy \parencite{metr2025}. Recent results show that frontier models can autonomously complete tasks with a time horizon of about 40 minutes, but not yet work requiring days or weeks \parencite{metr2025,nature2025}.
\item \textbf{PowerPoint Task Completion (PPTC):} Evaluates LLMs on multi-turn, multi-modal instructions within PowerPoint, revealing challenges in tool use, error accumulation, and multi-modality; GPT-4 leads, but all models struggle with non-text operations and long sessions \parencite{acl2024}.
\item \textbf{Long-Horizon Vision-Language Navigation (LHPR-VLN):} Benchmarks LLMs and specialized agents on multi-stage, complex navigation tasks. Most models fail even on short subtasks; only fine-tuned or specialized agents (e.g., MGDM) show partial success, underscoring the importance of memory and holistic understanding in long tasks \parencite{arxiv-lhpr-vln}.
\item \textbf{Other Environments:}
\begin{itemize}
\item Robotics: SayCan uses LLMs to generate action sequences for robots.
\item Web navigation: WebShop assesses LLMs in e-commerce scenarios.
\item Agent-based tasks: AgentBench evaluates LLMs as autonomous agents across 8 diverse environments.
\end{itemize}
\item \textbf{Key Insights:}
\begin{itemize}
\item Current LLMs excel at short, well-defined tasks but suffer from reliability problems and error accumulation in long or complex workflows.
\item Progress is exponential: time horizons for task completion have been doubling every 7 months \parencite{metr2025}; see the extrapolation below.
\item Economic impact remains limited by current task horizons and by challenges in multi-modality and error handling.
\end{itemize}
\end{itemize}
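\medskip
{\footnotesize A back-of-envelope extrapolation of the METR trend (illustrative only; it assumes the reported doubling rate simply continues): with today's horizon $H_0 \approx 40$ minutes and doubling time $T \approx 7$ months,
\[
H(t) \approx H_0 \cdot 2^{\,t/T}, \qquad \mbox{e.g. } H(21\ \mbox{months}) \approx 40 \cdot 2^{3} = 320 \mbox{ minutes}.
\]}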
\end{frame}

% --- Slide: Leaderboards ---
\begin{frame}{Leaderboards}
\begin{itemize}
\item \textbf{Purpose:}
\begin{itemize}
\item Track and compare LLM performance across benchmarks.
\item Provide a standardized way to evaluate model capabilities.
\end{itemize}
\item \textbf{Examples:}
\begin{itemize}
\item Gorilla APIBench Leaderboard \parencite{gorilla}
\item Aider.chat Leaderboards \parencite{aider}
\item Hugging Face Open LLM Leaderboard
\item LMSYS Chatbot Arena (ranked from pairwise human votes; see the rating sketch below)
\end{itemize}
\item \textbf{Benefits:}
\begin{itemize}
\item Promote transparency and reproducibility in AI evaluation.
\item Encourage healthy competition and rapid innovation.
\item Help researchers and practitioners identify state-of-the-art models.
\end{itemize}
\item \textbf{Considerations:}
\begin{itemize}
\item Leaderboards may not capture all real-world use cases.
\item Benchmarks must be kept robust, diverse, and up-to-date.
\item Risk of overfitting to specific leaderboard metrics.
\end{itemize}
\end{itemize}
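\medskip
{\footnotesize Arena-style leaderboards turn pairwise human votes into ratings via an Elo/Bradley--Terry scheme. A minimal sketch (the update constant $K$ is illustrative, not Chatbot Arena's exact pipeline):
\[
E_A = \frac{1}{1 + 10^{(R_B - R_A)/400}}, \qquad R_A \leftarrow R_A + K\,(S_A - E_A),
\]
where $S_A \in \{0, 1/2, 1\}$ records a loss, tie, or win for model $A$.}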
\end{frame}

% --- Slide: Prediction Markets ---
\begin{frame}{Prediction Markets for AI Progress}
\begin{itemize}
\item Platforms like Metaculus and Polymarket aggregate forecasts on AI milestones \parencite{metaculus,polymarket}; market prices can be read as probabilities (see below).
\item Useful for synthesizing expert and crowd expectations about future capabilities.
\item Complement empirical benchmarks with probabilistic insights.
\item Recent surge in activity: major platforms have seen notable increases in trading volume and engagement, especially around high-profile events and technological milestones \parencite{polymarket,metaculus}.
\item Metaculus specializes in long-term, technology-focused questions, enabling nuanced tracking of progress in areas like quantum computing and advanced AI systems \parencite{metaculus}.
\item Polymarket and PredictIt demonstrate how prediction markets can reflect real-time shifts in collective expectations, sometimes diverging from traditional expert consensus \parencite{polymarket,predictit}.
\item AI-powered information aggregation is enhancing prediction markets, allowing finer-grained, real-time analysis and more targeted event creation \parencite{hackernoon}.
\item Prediction markets can help identify emerging trends, inform policy, and guide strategic investments in AI by revealing where consensus and uncertainty lie.
\end{itemize}
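\medskip
{\footnotesize A binary contract paying \$1 if an event occurs trades at a price $\pi \in (0,1)$ that is read as the market's probability estimate (a \$0.70 price implies $P \approx 0.7$). One textbook automated market maker, Hanson's LMSR, prices outcome $i$ from the outstanding share vector $q$ as
\[
p_i = \frac{e^{q_i/b}}{\sum_j e^{q_j/b}},
\]
with liquidity parameter $b$; actual platforms may instead use order books or other mechanisms.}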
\end{frame}

% --- Slide: International Tracking and Reports ---
\begin{frame}{International Tracking and Reports}
\begin{itemize}
\item \textbf{International AI Safety Report:}
\begin{itemize}
\item Annual, multi-stakeholder assessment of AI progress, risks, and governance \parencite{aisafety}.
\item Includes expert insights from academia, industry, and civil society.
\end{itemize}
\item \textbf{TrackingAI.org:}
\begin{itemize}
\item Centralized resource for tracking AI system capabilities and benchmarks \parencite{trackingai}.
\item Features interactive dashboards and regular updates.
\end{itemize}
\item \textbf{Emerging Initiatives:}
\begin{itemize}
\item Regional and international AI observatories (e.g., the EU AI Observatory).
\item Collaborative databases for sharing best practices and incident reports.
\end{itemize}
\item \textbf{Key Benefits:}
\begin{itemize}
\item Facilitate global coordination and evidence-based policy.
\item Increase transparency and accountability in AI development.
\item Support proactive risk management and regulatory adaptation.
\end{itemize}
\end{itemize}
\end{frame}

% --- Slide: Challenges and Limitations ---
\begin{frame}[allowframebreaks]{Challenges in Measuring AI Capabilities}
\begin{itemize}
\item \textbf{Multi-modality and real-world complexity remain difficult to benchmark} \parencite{acl2024}.
\begin{itemize}
\item Integrating text, images, audio, and video introduces interdependencies that are hard to isolate and measure.
\item Real-world scenarios often involve ambiguous or incomplete information, making standardized evaluation challenging.
\end{itemize}
\item \textbf{Error accumulation in long-horizon tasks.}
\begin{itemize}
\item As AI systems perform longer sequences of actions or reasoning steps, small errors compound into significant inaccuracies (see the worked example below).
\item This makes it difficult to assess reliability over extended interactions or complex workflows.
\end{itemize}
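\medskip
{\footnotesize A worked example under a simplifying independence assumption (each step succeeds with probability $p$, independently; real agents are not this clean):
\[
P(\mbox{all } n \mbox{ steps succeed}) = p^{\,n}, \qquad 0.99^{100} \approx 0.37,
\]
so a 1\% per-step error rate already sinks most 100-step runs.}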
\item \textbf{Subjective tasks (e.g., aesthetics) are hard to evaluate automatically.}
\begin{itemize}
\item Human judgment is often required for tasks involving creativity, style, or subjective quality.
\item Automated metrics may fail to capture nuances that are obvious to humans; rater agreement itself must then be measured (see below).
\end{itemize}
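\medskip
{\footnotesize When human raters stand in for automated metrics, their reliability is itself quantified; one common chance-corrected agreement statistic is Cohen's kappa:
\[
\kappa = \frac{p_o - p_e}{1 - p_e},
\]
where $p_o$ is the observed agreement between raters and $p_e$ the agreement expected by chance.}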
\item \textbf{Need for continual updates as models and tasks evolve.}
\begin{itemize}
\item Benchmarks quickly become outdated (saturated, or leaked into training data) as new models and capabilities emerge.
\item Evaluation frameworks must be continuously adapted to keep pace with technological progress.
\end{itemize}
\item \textbf{Generalization across domains remains a key challenge.}
\begin{itemize}
\item Models often perform well on specific benchmarks but struggle to generalize to unseen or novel situations.
\item Ensuring robustness and adaptability in diverse environments is an ongoing research problem.
\end{itemize}
\end{itemize}
\end{frame}

% --- Slide: Conclusion ---
\begin{frame}{Conclusion}
\begin{itemize}
\item Measuring AI/LLM capabilities is essential for safe and effective deployment.
\item A combination of benchmarks, leaderboards, prediction markets, and international reports provides a holistic view.
\item Ongoing research is needed to address emerging challenges and ensure robust evaluation.
\item Collaboration among academia, industry, and policymakers is crucial for advancing evaluation methods.
\item Transparency in AI assessment processes builds public trust and supports informed decision-making.
\item Future directions should consider ethical implications and societal impact alongside technical performance.
\end{itemize}
\end{frame}

% --- Optional: Thank You Slide ---
\begin{frame}{Thank You}
\begin{center}
Thank you for your attention! Questions?
\end{center}
\end{frame}

% --- Slide: References ---
\begin{frame}[allowframebreaks]{References}
\printbibliography
\end{frame}

\end{document}