Add new talk

rfl 2025-06-15 18:34:11 +02:00
parent b7eaf80617
commit 8272a1639e
Signed by: rfl
GPG key ID: 48B0E5DDF8FA62EF
4 changed files with 327 additions and 0 deletions


@@ -0,0 +1 @@
# This talk was almost entirely generated by the Perplexity LLM


@@ -0,0 +1,64 @@
@misc{metr2025,
  author = {{METR}},
  title  = {Measuring AI Ability to Complete Long Tasks},
  year   = {2025},
  url    = {https://metr.org/blog/2025-03-19-measuring-ai-ability-to-complete-long-tasks/}
}

@inproceedings{acl2024,
  author    = {Author(s) of ACL Paper}, % Replace with actual author(s)
  title     = {PowerPoint Task Completion: Evaluating LLMs on Multi-Turn, Multi-Modal Instructions},
  year      = {2024},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2024},
  url       = {https://aclanthology.org/2024.findings-acl.514.pdf}
}

@misc{gorilla,
  author = {{Gorilla Team}},
  title  = {Gorilla APIBench Leaderboard},
  year   = {2025},
  url    = {https://gorilla.cs.berkeley.edu/leaderboard.html}
}

@misc{aider,
  author = {{Aider Team}},
  title  = {Aider.chat Leaderboards},
  year   = {2025},
  url    = {https://aider.chat/docs/leaderboards/}
}

@misc{metaculus,
  author = {{Metaculus}},
  title  = {Metaculus: AI Prediction Markets},
  year   = {2025},
  url    = {https://www.metaculus.com/questions/?topic=ai}
}

@misc{predictit,
  author = {{PredictIt}},
  title  = {PredictIt Markets},
  year   = {2025},
  url    = {https://www.predictit.org/}
}

@misc{polymarket,
  author = {{Polymarket}},
  title  = {Polymarket: AI Markets},
  year   = {2025},
  url    = {https://polymarket.com/markets/all/ai}
}

@techreport{aisafety,
  author      = {{International AI Safety Report Team}},
  title       = {International AI Safety Report},
  year        = {2025},
  institution = {International AI Safety Report},
  url         = {https://arxiv.org/abs/2501.17805}
}

@misc{trackingai,
  author = {{TrackingAI Team}},
  title  = {TrackingAI.org},
  year   = {2025},
  url    = {https://trackingai.org/home}
}

Binary file not shown.


@@ -0,0 +1,262 @@
\documentclass[9pt]{beamer}%
\usetheme{Madrid}%
\usepackage[backend=biber, style=authoryear]{biblatex}%
\addbibresource{ai.bib}% bibliography file included in this commit
\title{Measuring AI/LLM Capabilities and Progress}%
\author{Your Name}%
\date{June 2025}%
\begin{document}
\begin{frame}
\titlepage
\end{frame}
% --- Slide: Motivation ---
\begin{frame}{Why Measure AI/LLM Capabilities?}
\begin{itemize}
\item \textbf{Rapid progress} in Large Language Models (LLMs) and AI systems.
\item \textbf{Need for robust, transparent benchmarks} to track abilities and
risks.
\item \textbf{Inform policy, safety, and deployment decisions.}
\item \textbf{Ensure accountability and trust} in AI systems used by the public
and organizations.
\item \textbf{Identify and mitigate biases and harmful behaviors} that may
emerge in AI outputs.
\item \textbf{Guide investment and research priorities} by highlighting
strengths and weaknesses.
\item \textbf{Support ethical AI development} through measurable standards and
continuous improvement.
\item \textbf{Facilitate fair comparisons} between different models and
approaches.
\end{itemize}
\end{frame}
% --- Slide: Frameworks for Measurement (Extended) ---
\begin{frame}{Frameworks for Measuring Progress}
\begin{itemize}
\item \textbf{Task Completion Benchmarks:} Assess ability to perform complex,
multi-step tasks; widely used for model comparison and validation.
\item \textbf{Leaderboards:} Public, standardized comparisons across models;
encourage transparency and drive innovation.
\item \textbf{Prediction Markets:} Aggregate expert and crowd forecasts on AI
milestones; leverage collective intelligence for progress tracking.
\item \textbf{International Reports:} Comprehensive, cross-institutional
tracking of AI progress and risks; provide global perspective and policy
guidance.
\item \textbf{Standardized Evaluation Datasets:} Curated datasets for
reproducible testing; ensure fair and consistent model assessment.
\item \textbf{User Feedback and Real-World Deployment:} Collect empirical data
from real applications; highlight practical performance and user satisfaction.
\item \textbf{Longitudinal Studies:} Track AI system improvements over time;
identify trends and inflection points in capability growth.
\item \textbf{Expert Panels and Peer Reviews:} Structured assessments by domain
specialists; offer nuanced insights beyond automated metrics.
\end{itemize}
\end{frame}
% --- Slide: Task Completion Benchmarks ---
\begin{frame}{Task Completion Benchmarks}
\begin{itemize}
\item \textbf{Long-Horizon Task Evaluation:} METR's research measures LLMs'
ability to complete extended, multi-step tasks, highlighting current
limitations in reliability and autonomy \parencite{metr2025}. Recent results
show that frontier models can autonomously complete tasks with a time horizon
of about 40 minutes, but not yet work requiring days or weeks
\parencite{metr2025,nature2025}.
\item \textbf{PowerPoint Task Completion (PPTC):} Evaluates LLMs on multi-turn,
multi-modal instructions within PowerPoint, revealing challenges in tool use,
error accumulation, and multi-modality \parencite{acl2024}. GPT-4 leads in
performance, but all models struggle with non-text operations and long
sessions \parencite{acl2024}.
\item \textbf{Long-Horizon Vision-Language Navigation (LHPR-VLN):} Benchmarks
LLMs and specialized agents on multi-stage, complex navigation tasks. Most
models fail on short subtasks; only fine-tuned or specialized agents (e.g.,
MGDM) show partial success, emphasizing the importance of memory and holistic
understanding in long tasks \parencite{arxiv-lhpr-vln}.
\item \textbf{Other Environments:}
\begin{itemize}
\item Robotics: SayCan uses LLMs to generate action sequences for robots.
\item Web Navigation: WebShop assesses LLMs in e-commerce scenarios.
\item Agent-Based Tasks: AgentBench evaluates LLMs as autonomous agents across
8 diverse environments.
\end{itemize}
\item \textbf{Key Insights:}
\begin{itemize}
\item Current LLMs excel at short, well-defined tasks but face reliability and
error accumulation in long or complex workflows.
\item Progress is exponential: model time horizons for task completion have
been doubling every 7 months, suggesting rapid improvement (see the sketch
below the list) \parencite{metr2025}.
\item Economic impact remains limited by current task horizons and challenges
in multi-modality and error handling.
\end{itemize}
\end{itemize}
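% A minimal sketch of the doubling claim above (assumed functional form,
% not METR's exact fit): if the task time horizon doubles every 7 months,
% the horizon after $t$ months is
\begin{equation*}
  H(t) = H_0 \cdot 2^{\,t/7},
\end{equation*}
% e.g.\ a 40-minute horizon today would reach $40 \cdot 2^{3} = 320$
% minutes (about 5.3 hours) after 21 months.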
\end{frame}
% --- Slide: Leaderboards ---
\begin{frame}{Leaderboards}
\begin{itemize}
\item \textbf{Purpose:}
\begin{itemize}
\item Track and compare LLM performance across benchmarks.
\item Provide a standardized way to evaluate model capabilities.
\end{itemize}
\item \textbf{Examples:}
\begin{itemize}
\item Gorilla APIBench Leaderboard \parencite{gorilla}
\item Aider.chat Leaderboards \parencite{aider}
\item Hugging Face Open LLM Leaderboard
\item LMSYS Chatbot Arena (pairwise human votes; see the rating sketch below)
\end{itemize}
\item \textbf{Benefits:}
\begin{itemize}
\item Promote transparency and reproducibility in AI evaluation.
\item Encourage healthy competition and rapid innovation.
\item Help researchers and practitioners identify state-of-the-art models.
\end{itemize}
\item \textbf{Considerations:}
\begin{itemize}
\item Leaderboards may not capture all real-world use cases.
\item Need to ensure benchmarks are robust, diverse, and up-to-date.
\item Risk of overfitting to specific leaderboard metrics.
\end{itemize}
\end{itemize}
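% Sketch of how arena-style leaderboards (e.g., LMSYS Chatbot Arena) turn
% pairwise human votes into rankings, assuming the usual Elo/Bradley--Terry
% model: the probability that model $i$ beats model $j$ is
\begin{equation*}
  P(i \succ j) = \frac{1}{1 + 10^{(R_j - R_i)/400}},
\end{equation*}
% where the ratings $R_i$ are fitted to the vote data; a 100-point rating
% gap corresponds to roughly a 64\% expected win rate.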
\end{frame}
% --- Slide: Prediction Markets ---
\begin{frame}{Prediction Markets for AI Progress}
\begin{itemize}
\item Platforms like Metaculus and Polymarket aggregate forecasts on AI
milestones \parencite{metaculus,polymarket}.
\item Useful for synthesizing expert and crowd expectations about future
capabilities.
\item Complement empirical benchmarks with probabilistic insights (see the
price-to-probability sketch below).
\item Recent surge in activity: Major platforms have seen notable increases in
trading volume and engagement, especially around high-profile events and
technological milestones \parencite{polymarket,metaculus}.
\item Metaculus specializes in long-term, technology-focused questions, enabling
nuanced tracking of progress in areas like quantum computing and advanced AI
systems \parencite{metaculus}.
\item Polymarket and PredictIt demonstrate how prediction markets can reflect
real-time shifts in collective expectations, sometimes diverging from
traditional expert consensus \parencite{polymarket,predictit}.
\item AI-powered information aggregation is enhancing prediction markets,
allowing for finer-grained, real-time analysis and more targeted event
creation \parencite{hackernoon}.
\item Prediction markets can help identify emerging trends, inform policy, and
guide strategic investments in AI by revealing where consensus and uncertainty
lie.
\end{itemize}
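% Sketch of reading a market price as a probability (assuming a binary
% contract that pays \$1 if the event occurs, ignoring fees and the time
% value of money):
\begin{equation*}
  \hat{p}(\text{event}) \approx \frac{\text{contract price}}{\text{payout}}
  = \frac{0.30}{1.00} = 30\%,
\end{equation*}
% i.e.\ a contract trading at 30 cents implies a crowd estimate of about a
% 30\% chance that the AI milestone is reached.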
\end{frame}
% --- Slide: International Tracking and Reports ---
\begin{frame}{International Tracking and Reports}
\begin{itemize}
\item \textbf{International AI Safety Report:}
\begin{itemize}
\item Annual, multi-stakeholder assessment of AI progress, risks, and governance
\parencite{aisafety}.
\item Includes expert insights from academia, industry, and civil society.
\end{itemize}
\item \textbf{TrackingAI.org:}
\begin{itemize}
\item Centralized resource for tracking AI system capabilities and benchmarks
\parencite{trackingai}.
\item Features interactive dashboards and regular updates.
\end{itemize}
\item \textbf{Emerging Initiatives:}
\begin{itemize}
\item Regional and international AI observatories (e.g., EU AI Observatory).
\item Collaborative databases for sharing best practices and incident reports.
\end{itemize}
\item \textbf{Key Benefits:}
\begin{itemize}
\item Facilitate global coordination and evidence-based policy.
\item Increase transparency and accountability in AI development.
\item Support proactive risk management and regulatory adaptation.
\end{itemize}
\end{itemize}
\end{frame}
% --- Slide: Challenges and Limitations ---
\begin{frame}{Challenges in Measuring AI Capabilities}
\begin{itemize}
\item \textbf{Multi-modality and real-world complexity remain difficult to
benchmark} \parencite{acl2024}.
\begin{itemize}
\item Integrating text, images, audio, and video introduces interdependencies
that are hard to isolate and measure.
\item Real-world scenarios often involve ambiguous or incomplete information,
making standardized evaluation challenging.
\end{itemize}
\item \textbf{Error accumulation in long-horizon tasks.}
\begin{itemize}
\item As AI systems perform longer sequences of actions or reasoning steps,
small errors can compound, leading to significant inaccuracies (see the
sketch at the end of this slide).
\item This makes it difficult to assess reliability over extended interactions
or complex workflows.
\end{itemize}
\item \textbf{Subjective tasks (e.g., aesthetics) are hard to evaluate
automatically.}
\begin{itemize}
\item Human judgment is often required for tasks involving creativity, style, or
subjective quality.
\item Automated metrics may fail to capture nuances that are obvious to humans.
\end{itemize}
\item \textbf{Need for continual updates as models and tasks evolve.}
\begin{itemize}
\item Benchmarks quickly become outdated as new models and capabilities emerge.
\item Continuous adaptation of evaluation frameworks is necessary to keep pace
with technological progress.
\end{itemize}
\item \textbf{Generalization across domains remains a key challenge.}
\begin{itemize}
\item Models often perform well on specific benchmarks but struggle to
generalize to unseen or novel situations.
\item Ensuring robustness and adaptability in diverse environments is an ongoing
research problem.
\end{itemize}
\end{itemize}
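% Sketch of the error-compounding point above: with independent per-step
% success probability $p$, the chance of finishing an $n$-step task without
% any mistake is
\begin{equation*}
  P(\text{success}) = p^{\,n}, \qquad \text{e.g.}\quad 0.99^{100} \approx 0.37,
\end{equation*}
% so even 99\% per-step reliability gives only about a 37\% success rate
% over 100 steps (assuming errors are independent and unrecoverable).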
\end{frame}
% --- Slide: Conclusion ---
\begin{frame}{Conclusion}
\begin{itemize}
\item Measuring AI/LLM capabilities is essential for safe and effective
deployment.
\item A combination of benchmarks, leaderboards, prediction markets, and
international reports provides a holistic view.
\item Ongoing research is needed to address emerging challenges and ensure
robust evaluation.
\item Collaboration among academia, industry, and policymakers is crucial for
advancing evaluation methods.
\item Transparency in AI assessment processes builds public trust and supports
informed decision-making.
\item Future directions should consider ethical implications and societal impact
alongside technical performance.
\end{itemize}
\end{frame}
% --- Optional: Thank You Slide ---
\begin{frame}{Thank You}
\begin{center}
Thank you for your attention!\ Questions?
\end{center}
\end{frame}
% --- Slide: References ---
\begin{frame}[allowframebreaks]{References}
\printbibliography
\end{frame}
\end{document}