Add new talk
This commit is contained in:
parent
b7eaf80617
commit
8272a1639e
4 changed files with 327 additions and 0 deletions
1
2025-06-12-ai-safety-meetup/README.md
Normal file
@@ -0,0 +1 @@
# This talk was almost entirely generated by the Perplexity LLM
64
2025-06-12-ai-safety-meetup/ai.bib
Normal file
@@ -0,0 +1,64 @@
@misc{metr2025,
  author = {METR},
  title  = {Measuring AI Ability to Complete Long Tasks},
  year   = {2025},
  url    = {https://metr.org/blog/2025-03-19-measuring-ai-ability-to-complete-long-tasks/}
}

@article{acl2024,
  author  = {Author(s) of ACL Paper}, % Replace with actual author(s)
  title   = {PowerPoint Task Completion: Evaluating LLMs on Multi-Turn, Multi-Modal Instructions},
  year    = {2024},
  journal = {Findings of the Association for Computational Linguistics},
  note    = {ACL 2024},
  url     = {https://aclanthology.org/2024.findings-acl.514.pdf}
}

@misc{gorilla,
  author = {Gorilla Team},
  title  = {Gorilla APIBench Leaderboard},
  year   = {2025},
  url    = {https://gorilla.cs.berkeley.edu/leaderboard.html}
}

@misc{aider,
  author = {Aider Team},
  title  = {Aider.chat Leaderboards},
  year   = {2025},
  url    = {https://aider.chat/docs/leaderboards/}
}

@misc{metaculus,
  author = {Metaculus},
  title  = {Metaculus: AI Prediction Markets},
  year   = {2025},
  url    = {https://www.metaculus.com/questions/?topic=ai}
}

@misc{predictit,
  author = {PredictIt},
  title  = {PredictIt Markets},
  year   = {2025},
  url    = {https://www.predictit.org/}
}

@misc{polymarket,
  author = {Polymarket},
  title  = {Polymarket: AI Markets},
  year   = {2025},
  url    = {https://polymarket.com/markets/all/ai}
}

@techreport{aisafety,
  author      = {International AI Safety Report Team},
  title       = {International AI Safety Report},
  year        = {2025},
  institution = {International AI Safety Report},
  url         = {https://arxiv.org/abs/2501.17805}
}

@misc{trackingai,
  author = {TrackingAI Team},
  title  = {TrackingAI.org},
  year   = {2025},
  url    = {https://trackingai.org/home}
}
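@comment{ Usage sketch: each key above is cited from talk.tex via biblatex
  commands such as \parencite{metr2025}; keys cited in the .tex but missing
  here would surface as biber warnings and unresolved citations in the PDF. }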
BIN
2025-06-12-ai-safety-meetup/talk.pdf
Normal file
Binary file not shown.
262
2025-06-12-ai-safety-meetup/talk.tex
Normal file
@@ -0,0 +1,262 @@
\documentclass[9pt]{beamer}%
\usetheme{Madrid}%
\usepackage[backend=biber, style=authoryear]{biblatex}%

\addbibresource{ai.bib}

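% Build note (a minimal sketch, assuming a standard TeX toolchain): with
% backend=biber, plain BibTeX will not resolve these citations; run
%   pdflatex talk.tex && biber talk && pdflatex talk.tex && pdflatex talk.tex
% or let latexmk handle the passes: latexmk -pdf talk.tex
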
\title{Measuring AI/LLM Capabilities and Progress}%
\author{Your Name}%
\date{June 2025}%

\begin{document}

\begin{frame}
  \titlepage
\end{frame}

% --- Slide: Motivation ---
\begin{frame}{Why Measure AI/LLM Capabilities?}
\begin{itemize}
  \item \textbf{Rapid progress} in Large Language Models (LLMs) and AI
    systems.
  \item \textbf{Need for robust, transparent benchmarks} to track abilities
    and risks.
  \item \textbf{Inform policy, safety, and deployment decisions.}
  \item \textbf{Ensure accountability and trust} in AI systems used by the
    public and organizations.
  \item \textbf{Identify and mitigate biases and harmful behaviors} that may
    emerge in AI outputs.
  \item \textbf{Guide investment and research priorities} by highlighting
    strengths and weaknesses.
  \item \textbf{Support ethical AI development} through measurable standards
    and continuous improvement.
  \item \textbf{Facilitate fair comparisons} between different models and
    approaches.
\end{itemize}
\end{frame}

% --- Slide: Frameworks for Measurement (Extended) ---
\begin{frame}{Frameworks for Measuring Progress}
\begin{itemize}
  \item \textbf{Task Completion Benchmarks:} Assess ability to perform
    complex, multi-step tasks; widely used for model comparison and
    validation.
  \item \textbf{Leaderboards:} Public, standardized comparisons across
    models; encourage transparency and drive innovation.
  \item \textbf{Prediction Markets:} Aggregate expert and crowd forecasts on
    AI milestones; leverage collective intelligence for progress tracking.
  \item \textbf{International Reports:} Comprehensive, cross-institutional
    tracking of AI progress and risks; provide global perspective and policy
    guidance.
  \item \textbf{Standardized Evaluation Datasets:} Curated datasets for
    reproducible testing; ensure fair and consistent model assessment.
  \item \textbf{User Feedback and Real-World Deployment:} Collect empirical
    data from real applications; highlight practical performance and user
    satisfaction.
  \item \textbf{Longitudinal Studies:} Track AI system improvements over
    time; identify trends and inflection points in capability growth.
  \item \textbf{Expert Panels and Peer Reviews:} Structured assessments by
    domain specialists; offer nuanced insights beyond automated metrics.
\end{itemize}
\end{frame}

% --- Slide: Task Completion Benchmarks ---
\begin{frame}{Task Completion Benchmarks}
\begin{itemize}
  \item \textbf{Long-Horizon Task Evaluation:} METR's research measures LLMs'
    ability to complete extended, multi-step tasks, highlighting current
    limitations in reliability and autonomy \parencite{metr2025}. Recent
    results show that frontier models can autonomously complete tasks with a
    time horizon of about 40 minutes, but not yet work requiring days or
    weeks \parencite{metr2025}.
  \item \textbf{PowerPoint Task Completion (PPTC):} Evaluates LLMs on
    multi-turn, multi-modal instructions within PowerPoint, revealing
    challenges in tool use, error accumulation, and multi-modality
    \parencite{acl2024}. GPT-4 leads in performance, but all models struggle
    with non-text operations and long sessions \parencite{acl2024}.
  \item \textbf{Long-Horizon Vision-Language Navigation (LHPR-VLN):}
    Benchmarks LLMs and specialized agents on multi-stage, complex navigation
    tasks. Most models fail even on short subtasks; only fine-tuned or
    specialized agents (e.g., MGDM) show partial success, underscoring the
    importance of memory and holistic understanding in long tasks.
  \item \textbf{Other Environments:}
  \begin{itemize}
    \item Robotics: SayCan uses LLMs to generate action sequences for robots.
    \item Web Navigation: WebShop assesses LLMs in e-commerce scenarios.
    \item Agent-Based Tasks: AgentBench evaluates LLMs as autonomous agents
      across 8 diverse environments.
  \end{itemize}
  \item \textbf{Key Insights:}
  \begin{itemize}
    \item Current LLMs excel at short, well-defined tasks but face
      reliability problems and error accumulation in long or complex
      workflows.
    \item Progress is exponential: model time horizons for task completion
      have been doubling every 7 months, suggesting rapid improvement
      \parencite{metr2025}.
    \item Economic impact remains limited by current task horizons and by
      challenges in multi-modality and error handling.
  \end{itemize}
\end{itemize}
\end{frame}

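% A rough extrapolation sketch (illustrative only, assuming clean exponential
% growth): with H_0 the current 50%-success time horizon and a 7-month
% doubling time, H(t) = H_0 \cdot 2^{t/7} for t in months; a 40-minute
% horizon would then project to about 40 \cdot 2^{12/7} \approx 130 minutes
% one year out.
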
% --- Slide: Leaderboards ---
|
||||
\begin{frame}{Leaderboards}
\begin{itemize}
  \item \textbf{Purpose:}
  \begin{itemize}
    \item Track and compare LLM performance across benchmarks.
    \item Provide a standardized way to evaluate model capabilities.
  \end{itemize}
  \item \textbf{Examples:}
  \begin{itemize}
    \item Gorilla APIBench Leaderboard \parencite{gorilla}
    \item Aider.chat Leaderboards \parencite{aider}
    \item Hugging Face Open LLM Leaderboard
    \item LMSYS Chatbot Arena
  \end{itemize}
  \item \textbf{Benefits:}
  \begin{itemize}
    \item Promote transparency and reproducibility in AI evaluation.
    \item Encourage healthy competition and rapid innovation.
    \item Help researchers and practitioners identify state-of-the-art
      models.
  \end{itemize}
  \item \textbf{Considerations:}
  \begin{itemize}
    \item Leaderboards may not capture all real-world use cases.
    \item Need to ensure benchmarks are robust, diverse, and up-to-date.
    \item Risk of overfitting to specific leaderboard metrics.
  \end{itemize}
\end{itemize}
\end{frame}

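% Background sketch (assumption: arena-style leaderboards such as Chatbot
% Arena derive ratings from pairwise human votes with Elo/Bradley-Terry-style
% updates): the expected win rate of model A over model B is
%   E_A = 1 / (1 + 10^{(R_B - R_A)/400}),
% and ratings move by R_A <- R_A + K(S_A - E_A), where S_A is 1 for a win,
% 1/2 for a tie, and 0 for a loss.
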
% --- Slide: Prediction Markets ---
|
||||
\begin{frame}{Prediction Markets for AI Progress}
\begin{itemize}
  \item Platforms like Metaculus and Polymarket aggregate forecasts on AI
    milestones \parencite{metaculus,polymarket}.
  \item Useful for synthesizing expert and crowd expectations about future
    capabilities.
  \item Complement empirical benchmarks with probabilistic insights.
  \item Recent surge in activity: major platforms have seen notable increases
    in trading volume and engagement, especially around high-profile events
    and technological milestones \parencite{polymarket,metaculus}.
  \item Metaculus specializes in long-term, technology-focused questions,
    enabling nuanced tracking of progress in areas like quantum computing and
    advanced AI systems \parencite{metaculus}.
  \item Polymarket and PredictIt demonstrate how prediction markets can
    reflect real-time shifts in collective expectations, sometimes diverging
    from traditional expert consensus \parencite{polymarket,predictit}.
  \item AI-powered information aggregation is enhancing prediction markets,
    allowing finer-grained, real-time analysis and more targeted event
    creation.
  \item Prediction markets can help identify emerging trends, inform policy,
    and guide strategic investments in AI by revealing where consensus and
    uncertainty lie.
\end{itemize}
\end{frame}

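% Pricing sketch (the standard binary-market reading, not specific to any one
% platform): a contract paying $1 if an event occurs trades at price p, so p
% is the market-implied probability; a contract at $0.30 implies a roughly
% 30% crowd estimate, modulo fees and liquidity.
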
% --- Slide: International Tracking and Reports ---
|
||||
\begin{frame}{International Tracking and Reports}
\begin{itemize}
  \item \textbf{International AI Safety Report:}
  \begin{itemize}
    \item Annual, multi-stakeholder assessment of AI progress, risks, and
      governance \parencite{aisafety}.
    \item Includes expert insights from academia, industry, and civil
      society.
  \end{itemize}

  \item \textbf{TrackingAI.org:}
  \begin{itemize}
    \item Centralized resource for tracking AI system capabilities and
      benchmarks \parencite{trackingai}.
    \item Features interactive dashboards and regular updates.
  \end{itemize}

  \item \textbf{Emerging Initiatives:}
  \begin{itemize}
    \item Regional and international AI observatories (e.g., EU AI
      Observatory).
    \item Collaborative databases for sharing best practices and incident
      reports.
  \end{itemize}

  \item \textbf{Key Benefits:}
  \begin{itemize}
    \item Facilitate global coordination and evidence-based policy.
    \item Increase transparency and accountability in AI development.
    \item Support proactive risk management and regulatory adaptation.
  \end{itemize}
\end{itemize}
\end{frame}

% --- Slide: Challenges and Limitations ---
|
||||
\begin{frame}{Challenges in Measuring AI Capabilities}
\begin{itemize}
  \item \textbf{Multi-modality and real-world complexity remain difficult to
    benchmark} \parencite{acl2024}.
  \begin{itemize}
    \item Integrating text, images, audio, and video introduces
      interdependencies that are hard to isolate and measure.
    \item Real-world scenarios often involve ambiguous or incomplete
      information, making standardized evaluation challenging.
  \end{itemize}

  \item \textbf{Error accumulation in long-horizon tasks.}
  \begin{itemize}
    \item As AI systems perform longer sequences of actions or reasoning
      steps, small errors can compound, leading to significant inaccuracies.
      % (see the compounding sketch after this frame)
    \item This makes it difficult to assess reliability over extended
      interactions or complex workflows.
  \end{itemize}

  \item \textbf{Subjective tasks (e.g., aesthetics) are hard to evaluate
    automatically.}
  \begin{itemize}
    \item Human judgment is often required for tasks involving creativity,
      style, or subjective quality.
    \item Automated metrics may fail to capture nuances that are obvious to
      humans.
  \end{itemize}

  \item \textbf{Need for continual updates as models and tasks evolve.}
  \begin{itemize}
    \item Benchmarks quickly become outdated as new models and capabilities
      emerge.
    \item Continuous adaptation of evaluation frameworks is necessary to keep
      pace with technological progress.
  \end{itemize}

  \item \textbf{Generalization across domains remains a key challenge.}
  \begin{itemize}
    \item Models often perform well on specific benchmarks but struggle to
      generalize to unseen or novel situations.
    \item Ensuring robustness and adaptability in diverse environments is an
      ongoing research problem.
  \end{itemize}
\end{itemize}
\end{frame}

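% Compounding sketch (illustrative, assuming independent steps): if each step
% succeeds with probability p, a task of n sequential steps succeeds with
% probability p^n; e.g. p = 0.99 over n = 100 steps gives
% 0.99^{100} \approx 0.37, which is why long-horizon reliability collapses.
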
% --- Slide: Conclusion ---
|
||||
\begin{frame}{Conclusion}
\begin{itemize}
  \item Measuring AI/LLM capabilities is essential for safe and effective
    deployment.
  \item A combination of benchmarks, leaderboards, prediction markets, and
    international reports provides a holistic view.
  \item Ongoing research is needed to address emerging challenges and ensure
    robust evaluation.
  \item Collaboration among academia, industry, and policymakers is crucial
    for advancing evaluation methods.
  \item Transparency in AI assessment processes builds public trust and
    supports informed decision-making.
  \item Future directions should consider ethical implications and societal
    impact alongside technical performance.
\end{itemize}
\end{frame}

% --- Optional: Thank You Slide ---
% --- Optional: Thank You Slide ---
|
||||
\begin{frame}{Thank You}
  \begin{center}
    Thank you for your attention!\ Questions?
  \end{center}
\end{frame}

% --- Slide: References ---
\begin{frame}[allowframebreaks]{References}
  \printbibliography
\end{frame}

\end{document}