Add new talk

rfl 2025-06-15 18:34:11 +02:00
parent b7eaf80617
commit 8272a1639e
Signed by: rfl
GPG key ID: 48B0E5DDF8FA62EF
4 changed files with 327 additions and 0 deletions


@@ -0,0 +1 @@
# This talk was almost entirely generated by the Perplexity LLM


@@ -0,0 +1,64 @@
@misc{metr2025,
  author = {{METR}},
  title  = {Measuring AI Ability to Complete Long Tasks},
  year   = {2025},
  url    = {https://metr.org/blog/2025-03-19-measuring-ai-ability-to-complete-long-tasks/}
}

@inproceedings{acl2024,
  author    = {Author(s) of ACL Paper}, % Replace with actual author(s)
  title     = {PowerPoint Task Completion: Evaluating LLMs on Multi-Turn, Multi-Modal Instructions},
  year      = {2024},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2024},
  url       = {https://aclanthology.org/2024.findings-acl.514.pdf}
}

@misc{gorilla,
  author = {{Gorilla Team}},
  title  = {Gorilla APIBench Leaderboard},
  year   = {2025},
  url    = {https://gorilla.cs.berkeley.edu/leaderboard.html}
}

@misc{aider,
  author = {{Aider Team}},
  title  = {Aider.chat Leaderboards},
  year   = {2025},
  url    = {https://aider.chat/docs/leaderboards/}
}

@misc{metaculus,
  author = {{Metaculus}},
  title  = {Metaculus: AI Prediction Markets},
  year   = {2025},
  url    = {https://www.metaculus.com/questions/?topic=ai}
}

@misc{predictit,
  author = {{PredictIt}},
  title  = {PredictIt Markets},
  year   = {2025},
  url    = {https://www.predictit.org/}
}

@misc{polymarket,
  author = {{Polymarket}},
  title  = {Polymarket: AI Markets},
  year   = {2025},
  url    = {https://polymarket.com/markets/all/ai}
}

@techreport{aisafety,
  author      = {{International AI Safety Report Team}},
  title       = {International AI Safety Report},
  year        = {2025},
  institution = {International AI Safety Report},
  url         = {https://arxiv.org/abs/2501.17805}
}

@misc{trackingai,
  author = {{TrackingAI Team}},
  title  = {TrackingAI.org},
  year   = {2025},
  url    = {https://trackingai.org/home}
}

Binary file not shown.


@@ -0,0 +1,262 @@
\documentclass[9pt]{beamer}%
\usetheme{Madrid}%
\usepackage[backend=biber, style=authoryear]{biblatex}%
\addbibresource{ai.bib}% bibliography file included in this commit
\title{Measuring AI/LLM Capabilities and Progress}%
\author{Your Name}%
\date{June 2025}%
\begin{document}
\begin{frame}
\titlepage
\end{frame}
% --- Slide: Motivation ---
\begin{frame}{Why Measure AI/LLM Capabilities?}
\begin{itemize}
\item \textbf{Rapid progress} in Large Language Models (LLMs) and AI systems.
\item \textbf{Need for robust, transparent benchmarks} to track abilities and
risks.
\item \textbf{Inform policy, safety, and deployment decisions.}
\item \textbf{Ensure accountability and trust} in AI systems used by the public
and organizations.
\item \textbf{Identify and mitigate biases and harmful behaviors} that may
emerge in AI outputs.
\item \textbf{Guide investment and research priorities} by highlighting
strengths and weaknesses.
\item \textbf{Support ethical AI development} through measurable standards and
continuous improvement.
\item \textbf{Facilitate fair comparisons} between different models and
approaches.
\end{itemize}
\end{frame}
% --- Slide: Frameworks for Measurement (Extended) ---
\begin{frame}{Frameworks for Measuring Progress}
\begin{itemize}
\item \textbf{Task Completion Benchmarks:} Assess ability to perform complex,
multi-step tasks; widely used for model comparison and validation.
\item \textbf{Leaderboards:} Public, standardized comparisons across models;
encourage transparency and drive innovation.
\item \textbf{Prediction Markets:} Aggregate expert and crowd forecasts on AI
milestones; leverage collective intelligence for progress tracking.
\item \textbf{International Reports:} Comprehensive, cross-institutional
tracking of AI progress and risks; provide global perspective and policy
guidance.
\item \textbf{Standardized Evaluation Datasets:} Curated datasets for
reproducible testing; ensure fair and consistent model assessment.
\item \textbf{User Feedback and Real-World Deployment:} Collect empirical data
from real applications; highlight practical performance and user satisfaction.
\item \textbf{Longitudinal Studies:} Track AI system improvements over time;
identify trends and inflection points in capability growth.
\item \textbf{Expert Panels and Peer Reviews:} Structured assessments by domain
specialists; offer nuanced insights beyond automated metrics.
\end{itemize}
\end{frame}
% --- Slide: Task Completion Benchmarks ---
\begin{frame}{Task Completion Benchmarks}
\begin{itemize}
\item \textbf{Long-Horizon Task Evaluation:} METR's research measures LLMs'
ability to complete extended, multi-step tasks, highlighting current
limitations in reliability and autonomy \parencite{metr2025}. Recent results
show that frontier models can autonomously complete tasks with a time horizon
of about 40 minutes, but not yet work requiring days or weeks
\parencite{metr2025,nature2025}.
\item \textbf{PowerPoint Task Completion (PPTC):} Evaluates LLMs on multi-turn,
multi-modal instructions within PowerPoint, revealing challenges in tool use,
error accumulation, and multi-modality \parencite{acl2024}. GPT-4 leads in
performance, but all models struggle with non-text operations and long
sessions \parencite{acl2024}.
\item \textbf{Long-Horizon Vision-Language Navigation (LHPR-VLN):} Benchmarks
LLMs and specialized agents on multi-stage, complex navigation tasks. Most
models fail on short subtasks; only fine-tuned or specialized agents (e.g.,
MGDM) show partial success, emphasizing the importance of memory and holistic
understanding in long tasks \parencite{arxiv-lhpr-vln}.
\item \textbf{Other Environments:}
\begin{itemize}
\item Robotics: SayCan uses LLMs to generate action sequences for robots.
\item Web Navigation: WebShop assesses LLMs in e-commerce scenarios.
\item Agent-Based Tasks: AgentBench evaluates LLMs as autonomous agents across
8 diverse environments.
\end{itemize}
\item \textbf{Key Insights:}
\begin{itemize}
\item Current LLMs excel at short, well-defined tasks but face reliability and
error accumulation in long or complex workflows.
\item Progress is exponential: model time horizons for task completion have
been doubling every 7 months, suggesting rapid improvement (see the sketch
below the list) \parencite{metr2025}.
\item Economic impact remains limited by current task horizons and challenges
in multi-modality and error handling.
\end{itemize}
\end{itemize}
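% A minimal sketch of the doubling claim above (assumed functional form,
% not METR's exact fit): if the task time horizon doubles every 7 months,
% the horizon after $t$ months is
\begin{equation*}
  H(t) = H_0 \cdot 2^{\,t/7},
\end{equation*}
% e.g.\ a 40-minute horizon today would reach $40 \cdot 2^{3} = 320$
% minutes (about 5.3 hours) after 21 months.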
\end{frame}
% --- Slide: Leaderboards ---
\begin{frame}{Leaderboards}
\begin{itemize}
\item \textbf{Purpose:}
\begin{itemize}
\item Track and compare LLM performance across benchmarks.
\item Provide a standardized way to evaluate model capabilities.
\end{itemize}
\item \textbf{Examples:}
\begin{itemize}
\item Gorilla APIBench Leaderboard \parencite{gorilla}
\item Aider.chat Leaderboards \parencite{aider}
\item Hugging Face Open LLM Leaderboard
\item LMSYS Chatbot Arena (pairwise human votes; see the rating sketch below)
\end{itemize}
\item \textbf{Benefits:}
\begin{itemize}
\item Promote transparency and reproducibility in AI evaluation.
\item Encourage healthy competition and rapid innovation.
\item Help researchers and practitioners identify state-of-the-art models.
\end{itemize}
\item \textbf{Considerations:}
\begin{itemize}
\item Leaderboards may not capture all real-world use cases.
\item Need to ensure benchmarks are robust, diverse, and up-to-date.
\item Risk of overfitting to specific leaderboard metrics.
\end{itemize}
\end{itemize}
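% Sketch of how arena-style leaderboards (e.g., LMSYS Chatbot Arena) turn
% pairwise human votes into rankings, assuming the usual Elo/Bradley--Terry
% model: the probability that model $i$ beats model $j$ is
\begin{equation*}
  P(i \succ j) = \frac{1}{1 + 10^{(R_j - R_i)/400}},
\end{equation*}
% where the ratings $R_i$ are fitted to the vote data; a 100-point rating
% gap corresponds to roughly a 64\% expected win rate.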
\end{frame}
% --- Slide: Prediction Markets ---
\begin{frame}{Prediction Markets for AI Progress}
\begin{itemize}
\item Platforms like Metaculus and Polymarket aggregate forecasts on AI
milestones \parencite{metaculus,polymarket}.
\item Useful for synthesizing expert and crowd expectations about future
capabilities.
\item Complement empirical benchmarks with probabilistic insights (see the
price-to-probability sketch below).
\item Recent surge in activity: Major platforms have seen notable increases in
trading volume and engagement, especially around high-profile events and
technological milestones \parencite{polymarket,metaculus}.
\item Metaculus specializes in long-term, technology-focused questions, enabling
nuanced tracking of progress in areas like quantum computing and advanced AI
systems \parencite{metaculus}.
\item Polymarket and PredictIt demonstrate how prediction markets can reflect
real-time shifts in collective expectations, sometimes diverging from
traditional expert consensus \parencite{polymarket,predictit}.
\item AI-powered information aggregation is enhancing prediction markets,
allowing for finer-grained, real-time analysis and more targeted event
creation \parencite{hackernoon}.
\item Prediction markets can help identify emerging trends, inform policy, and
guide strategic investments in AI by revealing where consensus and uncertainty
lie.
\end{itemize}
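% Sketch of reading a market price as a probability (assuming a binary
% contract that pays \$1 if the event occurs, ignoring fees and the time
% value of money):
\begin{equation*}
  \hat{p}(\text{event}) \approx \frac{\text{contract price}}{\text{payout}}
  = \frac{0.30}{1.00} = 30\%,
\end{equation*}
% i.e.\ a contract trading at 30 cents implies a crowd estimate of about a
% 30\% chance that the AI milestone is reached.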
\end{frame}
% --- Slide: International Tracking and Reports ---
\begin{frame}{International Tracking and Reports}
\begin{itemize}
\item \textbf{International AI Safety Report:}
\begin{itemize}
\item Annual, multi-stakeholder assessment of AI progress, risks, and governance
\parencite{aisafety}.
\item Includes expert insights from academia, industry, and civil society.
\end{itemize}
\item \textbf{TrackingAI.org:}
\begin{itemize}
\item Centralized resource for tracking AI system capabilities and benchmarks
\parencite{trackingai}.
\item Features interactive dashboards and regular updates.
\end{itemize}
\item \textbf{Emerging Initiatives:}
\begin{itemize}
\item Regional and international AI observatories (e.g., EU AI Observatory).
\item Collaborative databases for sharing best practices and incident reports.
\end{itemize}
\item \textbf{Key Benefits:}
\begin{itemize}
\item Facilitate global coordination and evidence-based policy.
\item Increase transparency and accountability in AI development.
\item Support proactive risk management and regulatory adaptation.
\end{itemize}
\end{itemize}
\end{frame}
% --- Slide: Challenges and Limitations ---
\begin{frame}{Challenges in Measuring AI Capabilities}
\begin{itemize}
\item \textbf{Multi-modality and real-world complexity remain difficult to
benchmark} \parencite{acl2024}.
\begin{itemize}
\item Integrating text, images, audio, and video introduces interdependencies
that are hard to isolate and measure.
\item Real-world scenarios often involve ambiguous or incomplete information,
making standardized evaluation challenging.
\end{itemize}
\item \textbf{Error accumulation in long-horizon tasks.}
\begin{itemize}
\item As AI systems perform longer sequences of actions or reasoning steps,
small errors can compound, leading to significant inaccuracies (see the
sketch at the end of this slide).
\item This makes it difficult to assess reliability over extended interactions
or complex workflows.
\end{itemize}
\item \textbf{Subjective tasks (e.g., aesthetics) are hard to evaluate
automatically.}
\begin{itemize}
\item Human judgment is often required for tasks involving creativity, style, or
subjective quality.
\item Automated metrics may fail to capture nuances that are obvious to humans.
\end{itemize}
\item \textbf{Need for continual updates as models and tasks evolve.}
\begin{itemize}
\item Benchmarks quickly become outdated as new models and capabilities emerge.
\item Continuous adaptation of evaluation frameworks is necessary to keep pace
with technological progress.
\end{itemize}
\item \textbf{Generalization across domains remains a key challenge.}
\begin{itemize}
\item Models often perform well on specific benchmarks but struggle to
generalize to unseen or novel situations.
\item Ensuring robustness and adaptability in diverse environments is an ongoing
research problem.
\end{itemize}
\end{itemize}
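% Sketch of the error-compounding point above: with independent per-step
% success probability $p$, the chance of finishing an $n$-step task without
% any mistake is
\begin{equation*}
  P(\text{success}) = p^{\,n}, \qquad \text{e.g.}\quad 0.99^{100} \approx 0.37,
\end{equation*}
% so even 99\% per-step reliability gives only about a 37\% success rate
% over 100 steps (assuming errors are independent and unrecoverable).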
\end{frame}
% --- Slide: Conclusion ---
\begin{frame}{Conclusion}
\begin{itemize}
\item Measuring AI/LLM capabilities is essential for safe and effective
deployment.
\item A combination of benchmarks, leaderboards, prediction markets, and
international reports provides a holistic view.
\item Ongoing research is needed to address emerging challenges and ensure
robust evaluation.
\item Collaboration among academia, industry, and policymakers is crucial for
advancing evaluation methods.
\item Transparency in AI assessment processes builds public trust and supports
informed decision-making.
\item Future directions should consider ethical implications and societal impact
alongside technical performance.
\end{itemize}
\end{frame}
% --- Optional: Thank You Slide ---
\begin{frame}{Thank You}
\begin{center}
Thank you for your attention!\ Questions?
\end{center}
\end{frame}
% --- Slide: References ---
\begin{frame}[allowframebreaks]{References}
\printbibliography
\end{frame}
\end{document}