diff --git a/2025-06-12-ai-safety-meetup/README.md b/2025-06-12-ai-safety-meetup/README.md
new file mode 100644
index 0000000..7208947
--- /dev/null
+++ b/2025-06-12-ai-safety-meetup/README.md
@@ -0,0 +1 @@
+# This talk was almost entirely generated by the Perplexity LLM
diff --git a/2025-06-12-ai-safety-meetup/ai.bib b/2025-06-12-ai-safety-meetup/ai.bib
new file mode 100644
index 0000000..f416024
--- /dev/null
+++ b/2025-06-12-ai-safety-meetup/ai.bib
@@ -0,0 +1,64 @@
+@misc{metr2025,
+  author = {METR},
+  title = {Measuring AI Ability to Complete Long Tasks},
+  year = {2025},
+  url = {https://metr.org/blog/2025-03-19-measuring-ai-ability-to-complete-long-tasks/}
+}
+
+@article{acl2024,
+  author = {Author(s) of ACL Paper}, % Replace with actual author(s)
+  title = {PowerPoint Task Completion: Evaluating LLMs on Multi-Turn, Multi-Modal Instructions},
+  year = {2024},
+  journal = {Findings of the Association for Computational Linguistics},
+  note = {ACL 2024},
+  url = {https://aclanthology.org/2024.findings-acl.514.pdf}
+}
+
+@misc{gorilla,
+  author = {Gorilla Team},
+  title = {Gorilla APIBench Leaderboard},
+  year = {2025},
+  url = {https://gorilla.cs.berkeley.edu/leaderboard.html}
+}
+
+@misc{aider,
+  author = {Aider Team},
+  title = {Aider.chat Leaderboards},
+  year = {2025},
+  url = {https://aider.chat/docs/leaderboards/}
+}
+
+@misc{metaculus,
+  author = {Metaculus},
+  title = {Metaculus: AI Prediction Markets},
+  year = {2025},
+  url = {https://www.metaculus.com/questions/?topic=ai}
+}
+
+@misc{predictit,
+  author = {PredictIt},
+  title = {PredictIt Markets},
+  year = {2025},
+  url = {https://www.predictit.org/}
+}
+@misc{polymarket,
+  author = {Polymarket},
+  title = {Polymarket: AI Markets},
+  year = {2025},
+  url = {https://polymarket.com/markets/all/ai}
+}
+
+@techreport{aisafety,
+  author = {International AI Safety Report Team},
+  title = {International AI Safety Report},
+  year = {2025},
+  institution = {International AI Safety Report},
+  url = {https://arxiv.org/abs/2501.17805}
+}
+
+@misc{trackingai,
+  author = {TrackingAI Team},
+  title = {TrackingAI.org},
+  year = {2025},
+  url = {https://trackingai.org/home}
+}
diff --git a/2025-06-12-ai-safety-meetup/talk.pdf b/2025-06-12-ai-safety-meetup/talk.pdf
new file mode 100644
index 0000000..59e4cc1
Binary files /dev/null and b/2025-06-12-ai-safety-meetup/talk.pdf differ
diff --git a/2025-06-12-ai-safety-meetup/talk.tex b/2025-06-12-ai-safety-meetup/talk.tex
new file mode 100644
index 0000000..fe92c3d
--- /dev/null
+++ b/2025-06-12-ai-safety-meetup/talk.tex
@@ -0,0 +1,262 @@
+\documentclass[9pt]{beamer}%
+\usetheme{Madrid}%
+\usepackage[backend=biber, style=authoryear]{biblatex}%
+
+\addbibresource{ai.bib} % Replace with your .bib file
+
+\title{Measuring AI/LLM Capabilities and Progress}%
+\author{Your Name}%
+\date{June 2025}%
+
+\begin{document}
+
+\begin{frame}
+  \titlepage
+\end{frame}
+
+% --- Slide: Motivation ---
+\begin{frame}{Why Measure AI/LLM Capabilities?}
+\begin{itemize}
+\item \textbf{Rapid progress} in Large Language Models (LLMs) and AI systems.
+\item \textbf{Need for robust, transparent benchmarks} to track abilities and
+  risks.
+\item \textbf{Inform policy, safety, and deployment decisions.}
+\item \textbf{Ensure accountability and trust} in AI systems used by the public
+  and organizations.
+\item \textbf{Identify and mitigate biases and harmful behaviors} that may
+  emerge in AI outputs.
+\item \textbf{Guide investment and research priorities} by highlighting
+  strengths and weaknesses.
+\item \textbf{Support ethical AI development} through measurable standards and
+  continuous improvement.
+\item \textbf{Facilitate fair comparisons} between different models and
+  approaches.
+\end{itemize}
+\end{frame}
+
+% --- Slide: Frameworks for Measurement (Extended) ---
+\begin{frame}{Frameworks for Measuring Progress}
+\begin{itemize}
+\item \textbf{Task Completion Benchmarks:} Assess ability to perform complex,
+  multi-step tasks; widely used for model comparison and validation.
+\item \textbf{Leaderboards:} Public, standardized comparisons across models;
+  encourage transparency and drive innovation.
+\item \textbf{Prediction Markets:} Aggregate expert and crowd forecasts on AI
+  milestones; leverage collective intelligence for progress tracking.
+\item \textbf{International Reports:} Comprehensive, cross-institutional
+  tracking of AI progress and risks; provide global perspective and policy
+  guidance.
+\item \textbf{Standardized Evaluation Datasets:} Curated datasets for
+  reproducible testing; ensure fair and consistent model assessment.
+\item \textbf{User Feedback and Real-World Deployment:} Collect empirical data
+  from real applications; highlight practical performance and user satisfaction.
+\item \textbf{Longitudinal Studies:} Track AI system improvements over time;
+  identify trends and inflection points in capability growth.
+\item \textbf{Expert Panels and Peer Reviews:} Structured assessments by domain
+  specialists; offer nuanced insights beyond automated metrics.
+\end{itemize}
+\end{frame}
+
+% --- Slide: Task Completion Benchmarks ---
+\begin{frame}{Task Completion Benchmarks}
+\begin{itemize}
+\item \textbf{Long-Horizon Task Evaluation:} METR's research measures LLMs'
+  ability to complete extended, multi-step tasks, highlighting current
+  limitations in reliability and autonomy \parencite{metr2025}. Recent results
+  show that frontier models can autonomously complete tasks with a time horizon
+  of about 40 minutes, but not yet work requiring days or weeks
+  \parencite{metr2025,nature2025}.
+\item \textbf{PowerPoint Task Completion (PPTC):} Evaluates LLMs on multi-turn,
+  multi-modal instructions within PowerPoint, revealing challenges in tool use,
+  error accumulation, and multi-modality \parencite{acl2024}. GPT-4 leads in
+  performance, but all models struggle with non-text operations and long
+  sessions \parencite{acl2024}.
+\item \textbf{Long-Horizon Vision-Language Navigation (LHPR-VLN):} Benchmarks
+  LLMs and specialized agents on multi-stage, complex navigation tasks. Most
+  models fail on short subtasks; only fine-tuned or specialized agents (e.g.,
+  MGDM) show partial success, emphasizing the importance of memory and holistic
+  understanding in long tasks \parencite{arxiv-lhpr-vln}.
+\item \textbf{Other Environments:}
+  \begin{itemize}
+  \item Robotics: SayCan uses LLMs to generate action sequences for robots.
+  \item Web Navigation: WebShop assesses LLMs in e-commerce scenarios.
+  \item Agent-Based Tasks: AgentBench evaluates LLMs as autonomous agents across
+    8 diverse environments.
+  \end{itemize}
+\item \textbf{Key Insights:}
+  \begin{itemize}
+  \item Current LLMs excel at short, well-defined tasks but face reliability and
+    error accumulation in long or complex workflows.
+  \item Progress is exponential: model time horizons for task completion have
+    been doubling every 7 months, suggesting rapid improvement
+    \parencite{metr2025}.
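+    % Added for illustration: a back-of-envelope reading of the doubling claim.
+    % The ~40-minute horizon and the 7-month doubling time are the METR figures
+    % cited above; the 28-month extrapolation is an assumption for the sake of
+    % the example, not a forecast.
+    If the trend held, a rough extrapolation would give
+    \[
+      H(t) \approx H_0 \cdot 2^{\,t/(7\,\text{months})}, \qquad
+      H_0 \approx 40\,\text{min} \;\Rightarrow\;
+      H(28\,\text{months}) \approx 40 \cdot 2^{4} = 640\,\text{min} \approx 10.7\,\text{h}.
+    \]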
+  \item Economic impact remains limited by current task horizons and challenges
+    in multi-modality and error handling.
+  \end{itemize}
+\end{itemize}
+\end{frame}
+
+% --- Slide: Leaderboards ---
+\begin{frame}{Leaderboards}
+\begin{itemize}
+\item \textbf{Purpose:}
+  \begin{itemize}
+  \item Track and compare LLM performance across benchmarks.
+  \item Provide a standardized way to evaluate model capabilities.
+  \end{itemize}
+\item \textbf{Examples:}
+  \begin{itemize}
+  \item Gorilla APIBench Leaderboard \parencite{gorilla}
+  \item Aider.chat Leaderboards \parencite{aider}
+  \item Hugging Face Open LLM Leaderboard
+  \item LMSYS Chatbot Arena
+  \end{itemize}
+\item \textbf{Benefits:}
+  \begin{itemize}
+  \item Promote transparency and reproducibility in AI evaluation.
+  \item Encourage healthy competition and rapid innovation.
+  \item Help researchers and practitioners identify state-of-the-art models.
+  \end{itemize}
+\item \textbf{Considerations:}
+  \begin{itemize}
+  \item Leaderboards may not capture all real-world use cases.
+  \item Need to ensure benchmarks are robust, diverse, and up-to-date.
+  \item Risk of overfitting to specific leaderboard metrics.
+  \end{itemize}
+\end{itemize}
+\end{frame}
+
+% --- Slide: Prediction Markets ---
+\begin{frame}{Prediction Markets for AI Progress}
+\begin{itemize}
+\item Platforms like Metaculus and Polymarket aggregate forecasts on AI
+  milestones \parencite{metaculus,polymarket}.
+\item Useful for synthesizing expert and crowd expectations about future
+  capabilities.
+\item Complement empirical benchmarks with probabilistic insights.
+\item Recent surge in activity: Major platforms have seen notable increases in
+  trading volume and engagement, especially around high-profile events and
+  technological milestones \parencite{polymarket,metaculus}.
+\item Metaculus specializes in long-term, technology-focused questions, enabling
+  nuanced tracking of progress in areas like quantum computing and advanced AI
+  systems \parencite{metaculus}.
+\item Polymarket and PredictIt demonstrate how prediction markets can reflect
+  real-time shifts in collective expectations, sometimes diverging from
+  traditional expert consensus \parencite{polymarket,predictit}.
+\item AI-powered information aggregation is enhancing prediction markets,
+  allowing for finer-grained, real-time analysis and more targeted event
+  creation \parencite{hackernoon}.
+\item Prediction markets can help identify emerging trends, inform policy, and
+  guide strategic investments in AI by revealing where consensus and uncertainty
+  lie.
+\end{itemize}
+\end{frame}
+
+% --- Slide: International Tracking and Reports ---
+\begin{frame}{International Tracking and Reports}
+\begin{itemize}
+\item \textbf{International AI Safety Report:}
+\begin{itemize}
+\item Annual, multi-stakeholder assessment of AI progress, risks, and governance
+  \parencite{aisafety}.
+\item Includes expert insights from academia, industry, and civil society.
+\end{itemize}
+
+\item \textbf{TrackingAI.org:}
+\begin{itemize}
+\item Centralized resource for tracking AI system capabilities and benchmarks
+  \parencite{trackingai}.
+\item Features interactive dashboards and regular updates.
+\end{itemize}
+
+\item \textbf{Emerging Initiatives:}
+\begin{itemize}
+\item Regional and international AI observatories (e.g., EU AI Observatory).
+\item Collaborative databases for sharing best practices and incident reports.
+\end{itemize}
+
+\item \textbf{Key Benefits:}
+\begin{itemize}
+\item Facilitate global coordination and evidence-based policy.
+\item Increase transparency and accountability in AI development.
+\item Support proactive risk management and regulatory adaptation.
+\end{itemize}
+\end{itemize}
+\end{frame}
+
+% --- Slide: Challenges and Limitations ---
+\begin{frame}{Challenges in Measuring AI Capabilities}
+\begin{itemize}
+\item \textbf{Multi-modality and real-world complexity remain difficult to
+  benchmark} \parencite{acl2024}.
+\begin{itemize}
+\item Integrating text, images, audio, and video introduces interdependencies
+  that are hard to isolate and measure.
+\item Real-world scenarios often involve ambiguous or incomplete information,
+  making standardized evaluation challenging.
+\end{itemize}
+
+\item \textbf{Error accumulation in long-horizon tasks.}
+\begin{itemize}
+\item As AI systems perform longer sequences of actions or reasoning steps,
+  small errors can compound, leading to significant inaccuracies.
+\item This makes it difficult to assess reliability over extended interactions
+  or complex workflows.
+\end{itemize}
+
+\item \textbf{Subjective tasks (e.g., aesthetics) are hard to evaluate
+  automatically.}
+\begin{itemize}
+\item Human judgment is often required for tasks involving creativity, style, or
+  subjective quality.
+\item Automated metrics may fail to capture nuances that are obvious to humans.
+\end{itemize}
+
+\item \textbf{Need for continual updates as models and tasks evolve.}
+\begin{itemize}
+\item Benchmarks quickly become outdated as new models and capabilities emerge.
+\item Continuous adaptation of evaluation frameworks is necessary to keep pace
+  with technological progress.
+\end{itemize}
+
+\item \textbf{Generalization across domains remains a key challenge.}
+\begin{itemize}
+\item Models often perform well on specific benchmarks but struggle to
+  generalize to unseen or novel situations.
+\item Ensuring robustness and adaptability in diverse environments is an ongoing
+  research problem.
+\end{itemize}
+\end{itemize}
+\end{frame}
+
+% --- Slide: Conclusion ---
+\begin{frame}{Conclusion}
+\begin{itemize}
+\item Measuring AI/LLM capabilities is essential for safe and effective
+  deployment.
+\item A combination of benchmarks, leaderboards, prediction markets, and
+  international reports provides a holistic view.
+\item Ongoing research is needed to address emerging challenges and ensure
+  robust evaluation.
+\item Collaboration among academia, industry, and policymakers is crucial for
+  advancing evaluation methods.
+\item Transparency in AI assessment processes builds public trust and supports
+  informed decision-making.
+\item Future directions should consider ethical implications and societal impact
+  alongside technical performance.
+\end{itemize}
+\end{frame}
+
+% --- Optional: Thank You Slide ---
+\begin{frame}{Thank You}
+\begin{center}
+  Thank you for your attention!\ Questions?
+\end{center}
+\end{frame}
+
+% --- Slide: References ---
+\begin{frame}[allowframebreaks]{References}
+  \printbibliography
+\end{frame}
+
+\end{document}