Add many doc pages and fix doc tests

2024-12-16 14:55:12 +01:00 · 2024-12-16 14:55:12 +01:00 · 5231162274
parent 0eb582bdba
commit 5231162274
35 changed files with 828 additions and 3799 deletions
--- a/docs/source/_config.py
+++ b/docs/source/_config.py
@ -1,9 +1,9 @@
 # docstyle-ignore
 INSTALL_CONTENT = """
 # Transformers installation
-! pip install transformers datasets evaluate accelerate
+! pip install agents
 # To install from source instead of the last release, comment the command above and uncomment the following one.
-# ! pip install git+https://github.com/huggingface/transformers.git
+# ! pip install git+https://github.com/huggingface/agents.git
 """

 notebook_first_cells = [{"type": "code", "content": INSTALL_CONTENT}]
--- a/docs/source/_redirects.yml
+++ b/docs/source/_redirects.yml
@ -1,5 +0,0 @@
-# Optimizing inference
-
-perf_infer_gpu_many: perf_infer_gpu_one
-transformers_agents: agents
-quantization: quantization/overview
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@ -1,984 +1,25 @@
 - sections:
  - local: index
-    title: 🤗 Transformers
+    title: 🤗 Agents
  - local: quicktour
    title: Quick tour
-  - local: installation
-    title: Installation
-  - local: add_new_model
-    title: Adding a new model to `transformers`
  title: Get started
 - sections:
-  - local: pipeline_tutorial
-    title: Run inference with pipelines
-  - local: autoclass_tutorial
-    title: Write portable code with AutoClass
-  - local: preprocessing
-    title: Preprocess data
-  - local: training
-    title: Fine-tune a pretrained model
-  - local: run_scripts
-    title: Train with a script
-  - local: accelerate
-    title: Set up distributed training with 🤗 Accelerate
-  - local: peft
-    title: Load and train adapters with 🤗 PEFT
-  - local: model_sharing
-    title: Share your model
-  - local: agents
-    title: Agents 101
-  - local: agents_advanced
-    title: Agents, supercharged - Multi-agents, External tools, and more
-  - local: llm_tutorial
-    title: Generation with LLMs
-  - local: conversations
-    title: Chatting with Transformers
+  - local: building_good_agents
+    title: Building good agents
+  - local: tools
+    title: 🛠️ Tools - in-depth guide
  title: Tutorials
 - sections:
-  - isExpanded: false
-    sections:
-    - local: tasks/sequence_classification
-      title: Text classification
-    - local: tasks/token_classification
-      title: Token classification
-    - local: tasks/question_answering
-      title: Question answering
-    - local: tasks/language_modeling
-      title: Causal language modeling
-    - local: tasks/masked_language_modeling
-      title: Masked language modeling
-    - local: tasks/translation
-      title: Translation
-    - local: tasks/summarization
-      title: Summarization
-    - local: tasks/multiple_choice
-      title: Multiple choice
-    title: Natural Language Processing
-  - isExpanded: false
-    sections:
-    - local: tasks/audio_classification
-      title: Audio classification
-    - local: tasks/asr
-      title: Automatic speech recognition
-    title: Audio
-  - isExpanded: false
-    sections:
-    - local: tasks/image_classification
-      title: Image classification
-    - local: tasks/semantic_segmentation
-      title: Image segmentation
-    - local: tasks/video_classification
-      title: Video classification
-    - local: tasks/object_detection
-      title: Object detection
-    - local: tasks/zero_shot_object_detection
-      title: Zero-shot object detection
-    - local: tasks/zero_shot_image_classification
-      title: Zero-shot image classification
-    - local: tasks/monocular_depth_estimation
-      title: Depth estimation
-    - local: tasks/image_to_image
-      title: Image-to-Image
-    - local: tasks/image_feature_extraction
-      title: Image Feature Extraction
-    - local: tasks/mask_generation
-      title: Mask Generation
-    - local: tasks/keypoint_detection
-      title: Keypoint Detection
-    - local: tasks/knowledge_distillation_for_image_classification
-      title: Knowledge Distillation for Computer Vision
-    title: Computer Vision
-  - isExpanded: false
-    sections:
-    - local: tasks/image_captioning
-      title: Image captioning
-    - local: tasks/document_question_answering
-      title: Document Question Answering
-    - local: tasks/visual_question_answering
-      title: Visual Question Answering
-    - local: tasks/text-to-speech
-      title: Text to speech
-    - local: tasks/image_text_to_text
-      title: Image-text-to-text
-    - local: tasks/video_text_to_text
-      title: Video-text-to-text
-    title: Multimodal
-  - isExpanded: false
-    sections:
-    - local: generation_strategies
-      title: Customize the generation strategy
-    - local: kv_cache
-      title: Best Practices for Generation with Cache
-    title: Generation
-  - isExpanded: false
-    sections:
-    - local: tasks/idefics
-      title: Image tasks with IDEFICS
-    - local: tasks/prompting
-      title: LLM prompting guide
-    title: Prompting
-  title: Task Guides
- sections:
-  - local: fast_tokenizers
-    title: Use fast tokenizers from 🤗 Tokenizers
-  - local: multilingual
-    title: Run inference with multilingual models
-  - local: create_a_model
-    title: Use model-specific APIs
-  - local: custom_models
-    title: Share a custom model
-  - local: chat_templating
-    title: Chat templates
-  - local: trainer
-    title: Trainer
-  - local: sagemaker
-    title: Run training on Amazon SageMaker
-  - local: serialization
-    title: Export to ONNX
-  - local: tflite
-    title: Export to TFLite
-  - local: torchscript
-    title: Export to TorchScript
-  - local: benchmarks
-    title: Benchmarks
-  - local: notebooks
-    title: Notebooks with examples
-  - local: community
-    title: Community resources
-  - local: troubleshooting
-    title: Troubleshoot
-  - local: gguf
-    title: Interoperability with GGUF files
-  - local: tiktoken
-    title: Interoperability with TikToken files
-  - local: modular_transformers
-    title: Modularity in `transformers`
-  - local: how_to_hack_models
-    title: Model Hacking (overwriting a class to your usage)
-  title: Developer guides
- sections:
-  - local: quantization/overview
-    title: Getting started
-  - local: quantization/bitsandbytes
-    title: bitsandbytes
-  - local: quantization/gptq
-    title: GPTQ
-  - local: quantization/awq
-    title: AWQ
-  - local: quantization/aqlm
-    title: AQLM
-  - local: quantization/quanto
-    title: Quanto
-  - local: quantization/eetq
-    title: EETQ
-  - local: quantization/hqq
-    title: HQQ
-  - local: quantization/fbgemm_fp8
-    title: FBGEMM_FP8
-  - local: quantization/optimum
-    title: Optimum
-  - local: quantization/torchao
-    title: TorchAO
-  - local: quantization/bitnet
-    title: BitNet
-  - local: quantization/compressed_tensors
-    title: compressed-tensors
-  - local: quantization/contribute
-    title: Contribute new quantization method
-  title: Quantization Methods
- sections:
-  - local: performance
-    title: Overview
-  - local: llm_optims
-    title: LLM inference optimization
-  - sections:
-    - local: perf_train_gpu_one
-      title: Methods and tools for efficient training on a single GPU
-    - local: perf_train_gpu_many
-      title: Multiple GPUs and parallelism
-    - local: fsdp
-      title: Fully Sharded Data Parallel
-    - local: deepspeed
-      title: DeepSpeed
-    - local: perf_train_cpu
-      title: Efficient training on CPU
-    - local: perf_train_cpu_many
-      title: Distributed CPU training
-    - local: perf_train_tpu_tf
-      title: Training on TPU with TensorFlow
-    - local: perf_train_special
-      title: PyTorch training on Apple silicon
-    - local: perf_hardware
-      title: Custom hardware for training
-    - local: hpo_train
-      title: Hyperparameter Search using Trainer API
-    title: Efficient training techniques
-  - sections:
-    - local: perf_infer_cpu
-      title: CPU inference
-    - local: perf_infer_gpu_one
-      title: GPU inference
-    - local: perf_infer_gpu_multi
-      title: Multi-GPU inference
-    title: Optimizing inference
-  - local: big_models
-    title: Instantiate a big model
-  - local: debugging
-    title: Debugging
-  - local: tf_xla
-    title: XLA Integration for TensorFlow Models
-  - local: perf_torch_compile
-    title: Optimize inference using `torch.compile()`
-  title: Performance and scalability
- sections:
-  - local: contributing
-    title: How to contribute to 🤗 Transformers?
-  - local: add_new_model
-    title: How to add a model to 🤗 Transformers?
-  - local: add_new_pipeline
-    title: How to add a pipeline to 🤗 Transformers?
-  - local: testing
-    title: Testing
-  - local: pr_checks
-    title: Checks on a Pull Request
-  title: Contribute
- sections:
-  - local: philosophy
-    title: Philosophy
-  - local: glossary
-    title: Glossary
-  - local: task_summary
-    title: What 🤗 Transformers can do
-  - local: tasks_explained
-    title: How 🤗 Transformers solve tasks
-  - local: model_summary
-    title: The Transformer model family
-  - local: tokenizer_summary
-    title: Summary of the tokenizers
-  - local: attention
-    title: Attention mechanisms
-  - local: pad_truncation
-    title: Padding and truncation
-  - local: bertology
-    title: BERTology
-  - local: perplexity
-    title: Perplexity of fixed-length models
-  - local: pipeline_webserver
-    title: Pipelines for webserver inference
-  - local: model_memory_anatomy
-    title: Model training anatomy
-  - local: llm_tutorial_optimization
-    title: Getting the most out of LLMs
+  - local: intro_agents
+    title: An introduction to agentic systems
  title: Conceptual guides
+- sections:
+  - local: text_to_sql
+    title: Text-to-SQL
+  title: Examples
 - sections:
  - sections:
    - local: main_classes/agent
      title: Agents and Tools
-    - local: model_doc/auto
-      title: Auto Classes
-    - local: main_classes/backbones
-      title: Backbones
-    - local: main_classes/callback
-      title: Callbacks
-    - local: main_classes/configuration
-      title: Configuration
-    - local: main_classes/data_collator
-      title: Data Collator
-    - local: main_classes/keras_callbacks
-      title: Keras callbacks
-    - local: main_classes/logging
-      title: Logging
-    - local: main_classes/model
-      title: Models
-    - local: main_classes/text_generation
-      title: Text Generation
-    - local: main_classes/onnx
-      title: ONNX
-    - local: main_classes/optimizer_schedules
-      title: Optimization
-    - local: main_classes/output
-      title: Model outputs
-    - local: main_classes/pipelines
-      title: Pipelines
-    - local: main_classes/processors
-      title: Processors
-    - local: main_classes/quantization
-      title: Quantization
-    - local: main_classes/tokenizer
-      title: Tokenizer
-    - local: main_classes/trainer
-      title: Trainer
-    - local: main_classes/deepspeed
-      title: DeepSpeed
-    - local: main_classes/executorch
-      title: ExecuTorch
-    - local: main_classes/feature_extractor
-      title: Feature Extractor
-    - local: main_classes/image_processor
-      title: Image Processor
    title: Main Classes
-  - sections:
-    - isExpanded: false
-      sections:
-      - local: model_doc/albert
-        title: ALBERT
-      - local: model_doc/bart
-        title: BART
-      - local: model_doc/barthez
-        title: BARThez
-      - local: model_doc/bartpho
-        title: BARTpho
-      - local: model_doc/bert
-        title: BERT
-      - local: model_doc/bert-generation
-        title: BertGeneration
-      - local: model_doc/bert-japanese
-        title: BertJapanese
-      - local: model_doc/bertweet
-        title: Bertweet
-      - local: model_doc/big_bird
-        title: BigBird
-      - local: model_doc/bigbird_pegasus
-        title: BigBirdPegasus
-      - local: model_doc/biogpt
-        title: BioGpt
-      - local: model_doc/blenderbot
-        title: Blenderbot
-      - local: model_doc/blenderbot-small
-        title: Blenderbot Small
-      - local: model_doc/bloom
-        title: BLOOM
-      - local: model_doc/bort
-        title: BORT
-      - local: model_doc/byt5
-        title: ByT5
-      - local: model_doc/camembert
-        title: CamemBERT
-      - local: model_doc/canine
-        title: CANINE
-      - local: model_doc/codegen
-        title: CodeGen
-      - local: model_doc/code_llama
-        title: CodeLlama
-      - local: model_doc/cohere
-        title: Cohere
-      - local: model_doc/convbert
-        title: ConvBERT
-      - local: model_doc/cpm
-        title: CPM
-      - local: model_doc/cpmant
-        title: CPMANT
-      - local: model_doc/ctrl
-        title: CTRL
-      - local: model_doc/dbrx
-        title: DBRX
-      - local: model_doc/deberta
-        title: DeBERTa
-      - local: model_doc/deberta-v2
-        title: DeBERTa-v2
-      - local: model_doc/dialogpt
-        title: DialoGPT
-      - local: model_doc/distilbert
-        title: DistilBERT
-      - local: model_doc/dpr
-        title: DPR
-      - local: model_doc/electra
-        title: ELECTRA
-      - local: model_doc/encoder-decoder
-        title: Encoder Decoder Models
-      - local: model_doc/ernie
-        title: ERNIE
-      - local: model_doc/ernie_m
-        title: ErnieM
-      - local: model_doc/esm
-        title: ESM
-      - local: model_doc/falcon
-        title: Falcon
-      - local: model_doc/falcon_mamba
-        title: FalconMamba
-      - local: model_doc/fastspeech2_conformer
-        title: FastSpeech2Conformer
-      - local: model_doc/flan-t5
-        title: FLAN-T5
-      - local: model_doc/flan-ul2
-        title: FLAN-UL2
-      - local: model_doc/flaubert
-        title: FlauBERT
-      - local: model_doc/fnet
-        title: FNet
-      - local: model_doc/fsmt
-        title: FSMT
-      - local: model_doc/funnel
-        title: Funnel Transformer
-      - local: model_doc/fuyu
-        title: Fuyu
-      - local: model_doc/gemma
-        title: Gemma
-      - local: model_doc/gemma2
-        title: Gemma2
-      - local: model_doc/glm
-        title: GLM
-      - local: model_doc/openai-gpt
-        title: GPT
-      - local: model_doc/gpt_neo
-        title: GPT Neo
-      - local: model_doc/gpt_neox
-        title: GPT NeoX
-      - local: model_doc/gpt_neox_japanese
-        title: GPT NeoX Japanese
-      - local: model_doc/gptj
-        title: GPT-J
-      - local: model_doc/gpt2
-        title: GPT2
-      - local: model_doc/gpt_bigcode
-        title: GPTBigCode
-      - local: model_doc/gptsan-japanese
-        title: GPTSAN Japanese
-      - local: model_doc/gpt-sw3
-        title: GPTSw3
-      - local: model_doc/granite
-        title: Granite
-      - local: model_doc/granitemoe
-        title: GraniteMoe
-      - local: model_doc/herbert
-        title: HerBERT
-      - local: model_doc/ibert
-        title: I-BERT
-      - local: model_doc/jamba
-        title: Jamba
-      - local: model_doc/jetmoe
-        title: JetMoe
-      - local: model_doc/jukebox
-        title: Jukebox
-      - local: model_doc/led
-        title: LED
-      - local: model_doc/llama
-        title: LLaMA
-      - local: model_doc/llama2
-        title: Llama2
-      - local: model_doc/llama3
-        title: Llama3
-      - local: model_doc/longformer
-        title: Longformer
-      - local: model_doc/longt5
-        title: LongT5
-      - local: model_doc/luke
-        title: LUKE
-      - local: model_doc/m2m_100
-        title: M2M100
-      - local: model_doc/madlad-400
-        title: MADLAD-400
-      - local: model_doc/mamba
-        title: Mamba
-      - local: model_doc/mamba2
-        title: mamba2
-      - local: model_doc/marian
-        title: MarianMT
-      - local: model_doc/markuplm
-        title: MarkupLM
-      - local: model_doc/mbart
-        title: MBart and MBart-50
-      - local: model_doc/mega
-        title: MEGA
-      - local: model_doc/megatron-bert
-        title: MegatronBERT
-      - local: model_doc/megatron_gpt2
-        title: MegatronGPT2
-      - local: model_doc/mistral
-        title: Mistral
-      - local: model_doc/mixtral
-        title: Mixtral
-      - local: model_doc/mluke
-        title: mLUKE
-      - local: model_doc/mobilebert
-        title: MobileBERT
-      - local: model_doc/mpnet
-        title: MPNet
-      - local: model_doc/mpt
-        title: MPT
-      - local: model_doc/mra
-        title: MRA
-      - local: model_doc/mt5
-        title: MT5
-      - local: model_doc/mvp
-        title: MVP
-      - local: model_doc/myt5
-        title: myt5
-      - local: model_doc/nemotron
-        title: Nemotron
-      - local: model_doc/nezha
-        title: NEZHA
-      - local: model_doc/nllb
-        title: NLLB
-      - local: model_doc/nllb-moe
-        title: NLLB-MoE
-      - local: model_doc/nystromformer
-        title: Nyströmformer
-      - local: model_doc/olmo
-        title: OLMo
-      - local: model_doc/olmo2
-        title: OLMo2
-      - local: model_doc/olmoe
-        title: OLMoE
-      - local: model_doc/open-llama
-        title: Open-Llama
-      - local: model_doc/opt
-        title: OPT
-      - local: model_doc/pegasus
-        title: Pegasus
-      - local: model_doc/pegasus_x
-        title: PEGASUS-X
-      - local: model_doc/persimmon
-        title: Persimmon
-      - local: model_doc/phi
-        title: Phi
-      - local: model_doc/phi3
-        title: Phi-3
-      - local: model_doc/phimoe
-        title: PhiMoE
-      - local: model_doc/phobert
-        title: PhoBERT
-      - local: model_doc/plbart
-        title: PLBart
-      - local: model_doc/prophetnet
-        title: ProphetNet
-      - local: model_doc/qdqbert
-        title: QDQBert
-      - local: model_doc/qwen2
-        title: Qwen2
-      - local: model_doc/qwen2_moe
-        title: Qwen2MoE
-      - local: model_doc/rag
-        title: RAG
-      - local: model_doc/realm
-        title: REALM
-      - local: model_doc/recurrent_gemma
-        title: RecurrentGemma
-      - local: model_doc/reformer
-        title: Reformer
-      - local: model_doc/rembert
-        title: RemBERT
-      - local: model_doc/retribert
-        title: RetriBERT
-      - local: model_doc/roberta
-        title: RoBERTa
-      - local: model_doc/roberta-prelayernorm
-        title: RoBERTa-PreLayerNorm
-      - local: model_doc/roc_bert
-        title: RoCBert
-      - local: model_doc/roformer
-        title: RoFormer
-      - local: model_doc/rwkv
-        title: RWKV
-      - local: model_doc/splinter
-        title: Splinter
-      - local: model_doc/squeezebert
-        title: SqueezeBERT
-      - local: model_doc/stablelm
-        title: StableLm
-      - local: model_doc/starcoder2
-        title: Starcoder2
-      - local: model_doc/switch_transformers
-        title: SwitchTransformers
-      - local: model_doc/t5
-        title: T5
-      - local: model_doc/t5v1.1
-        title: T5v1.1
-      - local: model_doc/tapex
-        title: TAPEX
-      - local: model_doc/transfo-xl
-        title: Transformer XL
-      - local: model_doc/ul2
-        title: UL2
-      - local: model_doc/umt5
-        title: UMT5
-      - local: model_doc/xmod
-        title: X-MOD
-      - local: model_doc/xglm
-        title: XGLM
-      - local: model_doc/xlm
-        title: XLM
-      - local: model_doc/xlm-prophetnet
-        title: XLM-ProphetNet
-      - local: model_doc/xlm-roberta
-        title: XLM-RoBERTa
-      - local: model_doc/xlm-roberta-xl
-        title: XLM-RoBERTa-XL
-      - local: model_doc/xlm-v
-        title: XLM-V
-      - local: model_doc/xlnet
-        title: XLNet
-      - local: model_doc/yoso
-        title: YOSO
-      - local: model_doc/zamba
-        title: Zamba
-      title: Text models
-    - isExpanded: false
-      sections:
-      - local: model_doc/beit
-        title: BEiT
-      - local: model_doc/bit
-        title: BiT
-      - local: model_doc/conditional_detr
-        title: Conditional DETR
-      - local: model_doc/convnext
-        title: ConvNeXT
-      - local: model_doc/convnextv2
-        title: ConvNeXTV2
-      - local: model_doc/cvt
-        title: CvT
-      - local: model_doc/deformable_detr
-        title: Deformable DETR
-      - local: model_doc/deit
-        title: DeiT
-      - local: model_doc/depth_anything
-        title: Depth Anything
-      - local: model_doc/depth_anything_v2
-        title: Depth Anything V2
-      - local: model_doc/deta
-        title: DETA
-      - local: model_doc/detr
-        title: DETR
-      - local: model_doc/dinat
-        title: DiNAT
-      - local: model_doc/dinov2
-        title: DINOV2
-      - local: model_doc/dit
-        title: DiT
-      - local: model_doc/dpt
-        title: DPT
-      - local: model_doc/efficientformer
-        title: EfficientFormer
-      - local: model_doc/efficientnet
-        title: EfficientNet
-      - local: model_doc/focalnet
-        title: FocalNet
-      - local: model_doc/glpn
-        title: GLPN
-      - local: model_doc/hiera
-        title: Hiera
-      - local: model_doc/ijepa
-        title: I-JEPA
-      - local: model_doc/imagegpt
-        title: ImageGPT
-      - local: model_doc/levit
-        title: LeViT
-      - local: model_doc/mask2former
-        title: Mask2Former
-      - local: model_doc/maskformer
-        title: MaskFormer
-      - local: model_doc/mobilenet_v1
-        title: MobileNetV1
-      - local: model_doc/mobilenet_v2
-        title: MobileNetV2
-      - local: model_doc/mobilevit
-        title: MobileViT
-      - local: model_doc/mobilevitv2
-        title: MobileViTV2
-      - local: model_doc/nat
-        title: NAT
-      - local: model_doc/poolformer
-        title: PoolFormer
-      - local: model_doc/pvt
-        title: Pyramid Vision Transformer (PVT)
-      - local: model_doc/pvt_v2
-        title: Pyramid Vision Transformer v2 (PVTv2)
-      - local: model_doc/regnet
-        title: RegNet
-      - local: model_doc/resnet
-        title: ResNet
-      - local: model_doc/rt_detr
-        title: RT-DETR
-      - local: model_doc/segformer
-        title: SegFormer
-      - local: model_doc/seggpt
-        title: SegGpt
-      - local: model_doc/superpoint
-        title: SuperPoint
-      - local: model_doc/swiftformer
-        title: SwiftFormer
-      - local: model_doc/swin
-        title: Swin Transformer
-      - local: model_doc/swinv2
-        title: Swin Transformer V2
-      - local: model_doc/swin2sr
-        title: Swin2SR
-      - local: model_doc/table-transformer
-        title: Table Transformer
-      - local: model_doc/upernet
-        title: UperNet
-      - local: model_doc/van
-        title: VAN
-      - local: model_doc/vit
-        title: Vision Transformer (ViT)
-      - local: model_doc/vit_hybrid
-        title: ViT Hybrid
-      - local: model_doc/vitdet
-        title: ViTDet
-      - local: model_doc/vit_mae
-        title: ViTMAE
-      - local: model_doc/vitmatte
-        title: ViTMatte
-      - local: model_doc/vit_msn
-        title: ViTMSN
-      - local: model_doc/yolos
-        title: YOLOS
-      - local: model_doc/zoedepth
-        title: ZoeDepth
-      title: Vision models
-    - isExpanded: false
-      sections:
-      - local: model_doc/audio-spectrogram-transformer
-        title: Audio Spectrogram Transformer
-      - local: model_doc/bark
-        title: Bark
-      - local: model_doc/clap
-        title: CLAP
-      - local: model_doc/dac
-        title: dac
-      - local: model_doc/encodec
-        title: EnCodec
-      - local: model_doc/hiera
-        title: Hiera
-      - local: model_doc/hubert
-        title: Hubert
-      - local: model_doc/mctct
-        title: MCTCT
-      - local: model_doc/mimi
-        title: Mimi
-      - local: model_doc/mms
-        title: MMS
-      - local: model_doc/moshi
-        title: Moshi
-      - local: model_doc/musicgen
-        title: MusicGen
-      - local: model_doc/musicgen_melody
-        title: MusicGen Melody
-      - local: model_doc/pop2piano
-        title: Pop2Piano
-      - local: model_doc/seamless_m4t
-        title: Seamless-M4T
-      - local: model_doc/seamless_m4t_v2
-        title: SeamlessM4T-v2
-      - local: model_doc/sew
-        title: SEW
-      - local: model_doc/sew-d
-        title: SEW-D
-      - local: model_doc/speech_to_text
-        title: Speech2Text
-      - local: model_doc/speech_to_text_2
-        title: Speech2Text2
-      - local: model_doc/speecht5
-        title: SpeechT5
-      - local: model_doc/unispeech
-        title: UniSpeech
-      - local: model_doc/unispeech-sat
-        title: UniSpeech-SAT
-      - local: model_doc/univnet
-        title: UnivNet
-      - local: model_doc/vits
-        title: VITS
-      - local: model_doc/wav2vec2
-        title: Wav2Vec2
-      - local: model_doc/wav2vec2-bert
-        title: Wav2Vec2-BERT
-      - local: model_doc/wav2vec2-conformer
-        title: Wav2Vec2-Conformer
-      - local: model_doc/wav2vec2_phoneme
-        title: Wav2Vec2Phoneme
-      - local: model_doc/wavlm
-        title: WavLM
-      - local: model_doc/whisper
-        title: Whisper
-      - local: model_doc/xls_r
-        title: XLS-R
-      - local: model_doc/xlsr_wav2vec2
-        title: XLSR-Wav2Vec2
-      title: Audio models
-    - isExpanded: false
-      sections:
-      - local: model_doc/timesformer
-        title: TimeSformer
-      - local: model_doc/videomae
-        title: VideoMAE
-      - local: model_doc/vivit
-        title: ViViT
-      title: Video models
-    - isExpanded: false
-      sections:
-      - local: model_doc/align
-        title: ALIGN
-      - local: model_doc/altclip
-        title: AltCLIP
-      - local: model_doc/aria
-        title: Aria
-      - local: model_doc/blip
-        title: BLIP
-      - local: model_doc/blip-2
-        title: BLIP-2
-      - local: model_doc/bridgetower
-        title: BridgeTower
-      - local: model_doc/bros
-        title: BROS
-      - local: model_doc/chameleon
-        title: Chameleon
-      - local: model_doc/chinese_clip
-        title: Chinese-CLIP
-      - local: model_doc/clip
-        title: CLIP
-      - local: model_doc/clipseg
-        title: CLIPSeg
-      - local: model_doc/clvp
-        title: CLVP
-      - local: model_doc/data2vec
-        title: Data2Vec
-      - local: model_doc/deplot
-        title: DePlot
-      - local: model_doc/donut
-        title: Donut
-      - local: model_doc/flava
-        title: FLAVA
-      - local: model_doc/git
-        title: GIT
-      - local: model_doc/grounding-dino
-        title: Grounding DINO
-      - local: model_doc/groupvit
-        title: GroupViT
-      - local: model_doc/idefics
-        title: IDEFICS
-      - local: model_doc/idefics2
-        title: Idefics2
-      - local: model_doc/idefics3
-        title: Idefics3
-      - local: model_doc/instructblip
-        title: InstructBLIP
-      - local: model_doc/instructblipvideo
-        title: InstructBlipVideo
-      - local: model_doc/kosmos-2
-        title: KOSMOS-2
-      - local: model_doc/layoutlm
-        title: LayoutLM
-      - local: model_doc/layoutlmv2
-        title: LayoutLMV2
-      - local: model_doc/layoutlmv3
-        title: LayoutLMV3
-      - local: model_doc/layoutxlm
-        title: LayoutXLM
-      - local: model_doc/lilt
-        title: LiLT
-      - local: model_doc/llava
-        title: Llava
-      - local: model_doc/llava_next
-        title: LLaVA-NeXT
-      - local: model_doc/llava_next_video
-        title: LLaVa-NeXT-Video
-      - local: model_doc/llava_onevision
-        title: LLaVA-Onevision
-      - local: model_doc/lxmert
-        title: LXMERT
-      - local: model_doc/matcha
-        title: MatCha
-      - local: model_doc/mgp-str
-        title: MGP-STR
-      - local: model_doc/mllama
-        title: mllama
-      - local: model_doc/nougat
-        title: Nougat
-      - local: model_doc/omdet-turbo
-        title: OmDet-Turbo
-      - local: model_doc/oneformer
-        title: OneFormer
-      - local: model_doc/owlvit
-        title: OWL-ViT
-      - local: model_doc/owlv2
-        title: OWLv2
-      - local: model_doc/paligemma
-        title: PaliGemma
-      - local: model_doc/perceiver
-        title: Perceiver
-      - local: model_doc/pix2struct
-        title: Pix2Struct
-      - local: model_doc/pixtral
-        title: Pixtral
-      - local: model_doc/qwen2_audio
-        title: Qwen2Audio
-      - local: model_doc/qwen2_vl
-        title: Qwen2VL
-      - local: model_doc/sam
-        title: Segment Anything
-      - local: model_doc/siglip
-        title: SigLIP
-      - local: model_doc/speech-encoder-decoder
-        title: Speech Encoder Decoder Models
-      - local: model_doc/tapas
-        title: TAPAS
-      - local: model_doc/trocr
-        title: TrOCR
-      - local: model_doc/tvlt
-        title: TVLT
-      - local: model_doc/tvp
-        title: TVP
-      - local: model_doc/udop
-        title: UDOP
-      - local: model_doc/video_llava
-        title: VideoLlava
-      - local: model_doc/vilt
-        title: ViLT
-      - local: model_doc/vipllava
-        title: VipLlava
-      - local: model_doc/vision-encoder-decoder
-        title: Vision Encoder Decoder Models
-      - local: model_doc/vision-text-dual-encoder
-        title: Vision Text Dual Encoder
-      - local: model_doc/visual_bert
-        title: VisualBERT
-      - local: model_doc/xclip
-        title: X-CLIP
-      title: Multimodal models
-    - isExpanded: false
-      sections:
-      - local: model_doc/decision_transformer
-        title: Decision Transformer
-      - local: model_doc/trajectory_transformer
-        title: Trajectory Transformer
-      title: Reinforcement learning models
-    - isExpanded: false
-      sections:
-      - local: model_doc/autoformer
-        title: Autoformer
-      - local: model_doc/informer
-        title: Informer
-      - local: model_doc/patchtsmixer
-        title: PatchTSMixer
-      - local: model_doc/patchtst
-        title: PatchTST
-      - local: model_doc/time_series_transformer
-        title: Time Series Transformer
-      title: Time series models
-    - isExpanded: false
-      sections:
-      - local: model_doc/graphormer
-        title: Graphormer
-      title: Graph models
-    title: Models
-  - sections:
-    - local: internal/modeling_utils
-      title: Custom Layers and Utilities
-    - local: internal/pipelines_utils
-      title: Utilities for pipelines
-    - local: internal/tokenization_utils
-      title: Utilities for Tokenizers
-    - local: internal/trainer_utils
-      title: Utilities for Trainer
-    - local: internal/generation_utils
-      title: Utilities for Generation
-    - local: internal/image_processing_utils
-      title: Utilities for Image Processors
-    - local: internal/audio_utils
-      title: Utilities for Audio processing
-    - local: internal/file_utils
-      title: General Utilities
-    - local: internal/time_series_utils
-      title: Utilities for Time Series
-    title: Internal Helpers
-  title: API
--- a/docs/source/agents.md
+++ b/docs/source/agents.md
@ -1,463 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-# Agents and tools
-
-[[open-in-colab]]
-
-### What is an agent?
-
-Large Language Models (LLMs) trained to perform [causal language modeling](./tasks/language_modeling) can tackle a wide range of tasks, but they often struggle with basic tasks like logic, calculation, and search. When prompted in domains in which they do not perform well, they often fail to generate the answer we expect them to.
-
-One approach to overcome this weakness is to create an *agent*.
-
-An agent is a system that uses an LLM as its engine, and it has access to functions called *tools*.
-
-These *tools* are functions for performing a task, and they contain all necessary description for the agent to properly use them.
-
-The agent can be programmed to:
- devise a series of actions/tools and run them all at once,  like the [`CodeAgent`]
- plan and execute actions/tools one by one and wait for the outcome of each action before launching the next one, like the [`JsonAgent`]
-
-### Types of agents
-
-#### Code agent
-
-This agent has a planning step, then generates python code to execute all its actions at once. It natively handles different input and output types for its tools, thus it is the recommended choice for multimodal tasks.
-
-#### React agents
-
-This is the go-to agent to solve reasoning tasks, since the ReAct framework ([Yao et al., 2022](https://huggingface.co/papers/2210.03629)) makes it really efficient to think on the basis of its previous observations.
-
-We implement two versions of JsonAgent: 
- [`JsonAgent`] generates tool calls as a JSON in its output.
- [`CodeAgent`] is a new type of JsonAgent that generates its tool calls as blobs of code, which works really well for LLMs that have strong coding performance.
-
-> [!TIP]
-> Read [Open-source LLMs as LangChain Agents](https://huggingface.co/blog/open-source-llms-as-agents) blog post to learn more about ReAct agents.
-
-<div class="flex justify-center">
-    <img
-        class="block dark:hidden"
-        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Agent_ManimCE.gif"
-    />
-    <img
-        class="hidden dark:block"
-        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Agent_ManimCE.gif"
-    />
-</div>
-
-![Framework of a React Agent](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/open-source-llms-as-agents/ReAct.png)
-
-For example, here is how a ReAct Code agent would work its way through the following question.
-
-```py3
->>> agent.run(
-...     "How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?",
-... )
-=====New task=====
-How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?
-====Agent is executing the code below:
-bert_blocks = search(query="number of blocks in BERT base encoder")
-print("BERT blocks:", bert_blocks)
-====
-Print outputs:
-BERT blocks: twelve encoder blocks
-
-====Agent is executing the code below:
-attention_layer = search(query="number of layers in Attention is All You Need")
-print("Attention layers:", attention_layer)
-====
-Print outputs:
-Attention layers: Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position- 2 Page 3 Figure 1: The Transformer - model architecture.
-
-====Agent is executing the code below:
-bert_blocks = 12
-attention_layers = 6
-diff = bert_blocks - attention_layers
-print("Difference in blocks:", diff)
-final_answer(diff)
-====
-
-Print outputs:
-Difference in blocks: 6
-
-Final answer: 6
-```
-
-### How can I build an agent?
-
-To initialize an agent, you need these arguments:
-
- an LLM to power your agent - the agent is not exactly the LLM, it’s more like the agent is a program that uses an LLM as its engine.
- a system prompt: what the LLM engine will be prompted with to generate its output
- a toolbox from which the agent pick tools to execute
- a parser to extract from the LLM output which tools are to call and with which arguments
-
-Upon initialization of the agent system, the tool attributes are used to generate a tool description, then baked into the agent’s `system_prompt` to let it know which tools it can use and why.
-
-To start with, please install the `agents` extras in order to install all default dependencies.
-
-```bash
-pip install transformers[agents]
-```
-
-Build your LLM engine by defining a `llm_engine` method which accepts a list of [messages](./chat_templating) and returns text. This callable also needs to accept a `stop` argument that indicates when to stop generating.
-
-```python
-from huggingface_hub import login, InferenceClient
-
-login("<YOUR_HUGGINGFACEHUB_API_TOKEN>")
-
-client = InferenceClient(model="meta-llama/Meta-Llama-3-70B-Instruct")
-
-def llm_engine(messages, stop_sequences=["Task"]) -> str:
-    response = client.chat_completion(messages, stop=stop_sequences, max_tokens=1000)
-    answer = response.choices[0].message.content
-    return answer
-```
-
-You could use any `llm_engine` method as long as:
-1. it follows the [messages format](./chat_templating) (`List[Dict[str, str]]`) for its input `messages`, and it returns a `str`.
-2. it stops generating outputs at the sequences passed in the argument `stop_sequences`
-
-Additionally, `llm_engine` can also take a `grammar` argument. In the case where you specify a `grammar` upon agent initialization, this argument will be passed to the calls to llm_engine, with the `grammar` that you defined upon initialization, to allow [constrained generation](https://huggingface.co/docs/text-generation-inference/conceptual/guidance) in order to force properly-formatted agent outputs.
-
-You will also need a `tools` argument which accepts a list of `Tools` - it can be an empty list. You can also add the default toolbox on top of your `tools` list by defining the optional argument `add_base_tools=True`.
-
-Now you can create an agent, like [`CodeAgent`], and run it. You can also create a [`TransformersEngine`] with a pre-initialized pipeline to run inference on your local machine using `transformers`.
-For convenience, since agentic behaviours generally require stronger models such as `Llama-3.1-70B-Instruct` that are harder to run locally for now, we also provide the [`HfApiEngine`] class that initializes a `huggingface_hub.InferenceClient` under the hood. 
-
-```python
-from transformers import CodeAgent, HfApiEngine
-
-llm_engine = HfApiEngine(model="meta-llama/Meta-Llama-3-70B-Instruct")
-agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
-
-agent.run(
-    "Could you translate this sentence from French, say it out loud and return the audio.",
-    sentence="Où est la boulangerie la plus proche?",
-)
-```
-
-This will be handy in case of emergency baguette need!
-You can even leave the argument `llm_engine` undefined, and an [`HfApiEngine`] will be created by default.
-
-```python
-from transformers import CodeAgent
-
-agent = CodeAgent(tools=[], add_base_tools=True)
-
-agent.run(
-    "Could you translate this sentence from French, say it out loud and give me the audio.",
-    sentence="Où est la boulangerie la plus proche?",
-)
-```
-
-Note that we used an additional `sentence` argument: you can pass text as additional arguments to the model.
-
-You can also use this to indicate the path to local or remote files for the model to use:
-
-```py
-from transformers import CodeAgent
-
-agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
-
-agent.run("Why does Mike not know many people in New York?", audio="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/recording.mp3")
-```
-
-
-The prompt and output parser were automatically defined, but you can easily inspect them by calling the `system_prompt_template` on your agent.
-
-```python
-print(agent.system_prompt_template)
-```
-
-It's important to explain as clearly as possible the task you want to perform.
-Every [`~Agent.run`] operation is independent, and since an agent is powered by an LLM, minor variations in your prompt might yield completely different results.
-You can also run an agent consecutively for different tasks: each time the attributes `agent.task` and `agent.logs` will be re-initialized.
-
-
-#### Code execution
-
-A Python interpreter executes the code on a set of inputs passed along with your tools.
-This should be safe because the only functions that can be called are the tools you provided (especially if it's only tools by Hugging Face) and the print function, so you're already limited in what can be executed.
-
-The Python interpreter also doesn't allow imports by default outside of a safe list, so all the most obvious attacks shouldn't be an issue.
-You can still authorize additional imports by passing the authorized modules as a list of strings in argument `additional_authorized_imports` upon initialization of your [`CodeAgent`] or [`CodeAgent`]:
-
-```py
->>> from transformers import CodeAgent
-
->>> agent = CodeAgent(tools=[], additional_authorized_imports=['requests', 'bs4'])
->>> agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'?")
-
-(...)
-'Hugging Face – Blog'
-```
-
-The execution will stop at any code trying to perform an illegal operation or if there is a regular Python error with the code generated by the agent.
-
-> [!WARNING]
-> The LLM can generate arbitrary code that will then be executed: do not add any unsafe imports!
-
-### The system prompt
-
-An agent, or rather the LLM that drives the agent, generates an output based on the system prompt. The system prompt can be customized and tailored to the intended task. For example, check the system prompt for the [`CodeAgent`] (below version is slightly simplified).
-
-```text
-You will be given a task to solve as best you can.
-You have access to the following tools:
-{{tool_descriptions}}
-
-To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
-
-At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task, then the tools that you want to use.
-Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '/End code' sequence.
-During each intermediate step, you can use 'print()' to save whatever important information you will then need.
-These print outputs will then be available in the 'Observation:' field, for using this information as input for the next step.
-
-In the end you have to return a final answer using the `final_answer` tool.
-
-Here are a few examples using notional tools:
---
-{examples}
-
-Above example were using notional tools that might not exist for you. You only have acces to those tools:
-{{tool_names}}
-You also can perform computations in the python code you generate.
-
-Always provide a 'Thought:' and a 'Code:\n```py' sequence ending with '```<end_code>' sequence. You MUST provide at least the 'Code:' sequence to move forward.
-
-Remember to not perform too many operations in a single code block! You should split the task into intermediate code blocks.
-Print results at the end of each step to save the intermediate results. Then use final_answer() to return the final result.
-
-Remember to make sure that variables you use are all defined.
-
-Now Begin!
-```
-
-The system prompt includes:
- An *introduction* that explains how the agent should behave and what tools are.
- A description of all the tools that is defined by a `{{tool_descriptions}}` token that is dynamically replaced at runtime with the tools defined/chosen by the user.
-    - The tool description comes from the tool attributes, `name`, `description`, `inputs` and `output_type`,  and a simple `jinja2` template that you can refine.
- The expected output format.
-
-You could improve the system prompt, for example, by adding an explanation of the output format.
-
-For maximum flexibility, you can overwrite the whole system prompt template by passing your custom prompt as an argument to the `system_prompt` parameter.
-
-```python
-from transformers import JsonAgent
-from agents import PythonInterpreterTool
-
-agent = JsonAgent(tools=[PythonInterpreterTool()], system_prompt="{your_custom_prompt}")
-```
-
-> [!WARNING]
-> Please make sure to define the `{{tool_descriptions}}` string somewhere in the `template` so the agent is aware 
-of the available tools.
-
-
-### Inspecting an agent run
-
-Here are a few useful attributes to inspect what happened after a run:
- `agent.logs` stores the fine-grained logs of the agent. At every step of the agent's run, everything gets stored in a dictionary that then is appended to `agent.logs`.
- Running `agent.write_inner_memory_from_logs()` creates an inner memory of the agent's logs for the LLM to view, as a list of chat messages. This method goes over each step of the log and only stores what it's interested in as a message: for instance, it will save the system prompt and task in separate messages, then for each step it will store the LLM output as a message, and the tool call output as another message. Use this if you want a higher-level view of what has happened - but not every log will be transcripted by this method.
-
-## Tools
-
-A tool is an atomic function to be used by an agent.
-
-You can for instance check the [`PythonInterpreterTool`]: it has a name, a description, input descriptions, an output type, and a `__call__` method to perform the action.
-
-When the agent is initialized, the tool attributes are used to generate a tool description which is baked into the agent's system prompt. This lets the agent know which tools it can use and why.
-
-### Default toolbox
-
-Transformers comes with a default toolbox for empowering agents, that you can add to your agent upon initialization with argument `add_base_tools = True`:
-
- **Document question answering**: given a document (such as a PDF) in image format, answer a question on this document ([Donut](./model_doc/donut))
- **Image question answering**: given an image, answer a question on this image ([VILT](./model_doc/vilt))
- **Speech to text**: given an audio recording of a person talking, transcribe the speech into text ([Whisper](./model_doc/whisper))
- **Text to speech**: convert text to speech ([SpeechT5](./model_doc/speecht5))
- **Translation**: translates a given sentence from source language to target language.
- **DuckDuckGo search***: performs a web search using DuckDuckGo browser.
- **Python code interpreter**: runs your the LLM generated Python code in a secure environment. This tool will only be added to [`JsonAgent`] if you initialize it with `add_base_tools=True`, since code-based agent can already natively execute Python code
-
-
-You can manually use a tool by calling the [`load_tool`] function and a task to perform.
-
-
-```python
-from transformers import load_tool
-
-tool = load_tool("text-to-speech")
-audio = tool("This is a text to speech tool")
-```
-
-
-### Create a new tool
-
-You can create your own tool for use cases not covered by the default tools from Hugging Face.
-For example, let's create a tool that returns the most downloaded model for a given task from the Hub.
-
-You'll start with the code below.
-
-```python
-from huggingface_hub import list_models
-
-task = "text-classification"
-
-model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
-print(model.id)
-```
-
-This code can quickly be converted into a tool, just by wrapping it in a function and adding the `tool` decorator:
-
-
-```py
-from transformers import tool
-
-@tool
-def model_download_tool(task: str) -> str:
-    """
-    This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub.
-    It returns the name of the checkpoint.
-
-    Args:
-        task: The task for which
-    """
-    model = next(iter(list_models(filter="text-classification", sort="downloads", direction=-1)))
-    return model.id
-```
-
-The function needs:
- A clear name. The name usually describes what the tool does. Since the code returns the model with the most downloads for a task, let's put `model_download_tool`.
- Type hints on both inputs and output
- A description, that includes an 'Args:' part where each argument is described (without a type indication this time, it will be pulled from the type hint).
-All these will be automatically baked into the agent's system prompt upon initialization: so strive to make them as clear as possible!
-
-> [!TIP]
-> This definition format is the same as tool schemas used in `apply_chat_template`, the only difference is the added `tool` decorator: read more on our tool use API [here](https://huggingface.co/blog/unified-tool-use#passing-tools-to-a-chat-template).
-
-Then you can directly initialize your agent:
-```py
-from transformers import CodeAgent
-agent = CodeAgent(tools=[model_download_tool], llm_engine=llm_engine)
-agent.run(
-    "Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?"
-)
-```
-
-You get the following:
-```text
-======== New task ========
-Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?
-==== Agent is executing the code below:
-most_downloaded_model = model_download_tool(task="text-to-video")
-print(f"The most downloaded model for the 'text-to-video' task is {most_downloaded_model}.")
-====
-```
-
-And the output:
-`"The most downloaded model for the 'text-to-video' task is ByteDance/AnimateDiff-Lightning."`
-
-## Multi-agents
-
-Multi-agent has been introduced in Microsoft's framework [Autogen](https://huggingface.co/papers/2308.08155).
-It simply means having several agents working together to solve your task instead of only one.
-It empirically yields better performance on most benchmarks. The reason for this better performance is conceptually simple: for many tasks, rather than using a do-it-all system, you would prefer to specialize units on sub-tasks. Here, having agents with separate tool sets and memories allows to achieve efficient specialization.
-
-You can easily build hierarchical multi-agent systems with `agents`.
-
-To do so, encapsulate the agent in a [`ManagedAgent`] object. This object needs arguments `agent`, `name`, and a `description`, which will then be embedded in the manager agent's system prompt to let it know how to call this managed agent, as we also do for tools.
-
-Here's an example of making an agent that managed a specific web search agent using our [`DuckDuckGoSearchTool`]:
-
-```py
-from agents import CodeAgent, HfApiEngine, DuckDuckGoSearchTool, ManagedAgent
-
-llm_engine = HfApiEngine()
-
-web_agent = CodeAgent(tools=[DuckDuckGoSearchTool()], llm_engine=llm_engine)
-
-managed_web_agent = ManagedAgent(
-    agent=web_agent,
-    name="web_search",
-    description="Runs web searches for you. Give it your query as an argument."
-)
-
-manager_agent = CodeAgent(
-    tools=[], llm_engine=llm_engine, managed_agents=[managed_web_agent]
-)
-
-manager_agent.run("Who is the CEO of Hugging Face?")
-```
-
-> [!TIP]
-> For an in-depth example of an efficient multi-agent implementation, see [how we pushed our multi-agent system to the top of the GAIA leaderboard](https://huggingface.co/blog/beating-gaia).
-
-
-## Display your agent run in a cool Gradio interface
-
-You can leverage `gradio.Chatbot` to display your agent's thoughts using `stream_to_gradio`, here is an example:
-
-```py
-import gradio as gr
-from transformers import (
-    load_tool,
-    CodeAgent,
-    HfApiEngine,
-    stream_to_gradio,
-)
-
-# Import tool from Hub
-image_generation_tool = load_tool("m-ric/text-to-image")
-
-llm_engine = HfApiEngine("meta-llama/Meta-Llama-3-70B-Instruct")
-
-# Initialize the agent with the image generation tool
-agent = CodeAgent(tools=[image_generation_tool], llm_engine=llm_engine)
-
-
-def interact_with_agent(task):
-    messages = []
-    messages.append(gr.ChatMessage(role="user", content=task))
-    yield messages
-    for msg in stream_to_gradio(agent, task):
-        messages.append(msg)
-        yield messages + [
-            gr.ChatMessage(role="assistant", content="⏳ Task not finished yet!")
-        ]
-    yield messages
-
-
-with gr.Blocks() as demo:
-    text_input = gr.Textbox(lines=1, label="Chat Message", value="Make me a picture of the Statue of Liberty.")
-    submit = gr.Button("Run illustrator agent!")
-    chatbot = gr.Chatbot(
-        label="Agent",
-        type="messages",
-        avatar_images=(
-            None,
-            "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png",
-        ),
-    )
-    submit.click(interact_with_agent, [text_input], [chatbot])
-
-if __name__ == "__main__":
-    demo.launch()
-```
--- a/docs/source/conceptual_guides/intro_agents.md
+++ b/docs/source/conceptual_guides/intro_agents.md
@ -101,4 +101,4 @@ Few existing framework build on this idea to make code agents first-class citize

 Especially, since code execution can be a security concern (arbitrary code execution!), we provide options at runtime:
 - a secure python interpreter to run code more safely in your environment
- a sandboxed `uv` environment.
+- a sandboxed environment.
--- a/docs/source/examples/text_to_sql.md
+++ b/docs/source/examples/text_to_sql.md
@ -0,0 +1,219 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+# Text-to-SQL
+
+In this tutorial, we’ll see how to implement an agent that leverages SQL using `agents`.
+
+What’s the advantage over a standard text-to-SQL pipeline?
+
+A standard text-to-SQL pipeline is brittle, since the generated SQL query can be incorrect. Even worse, the query could be incorrect but not raise an error, silently returning incorrect or useless outputs instead.
+
+👉 Instead, an agent system is able to critically inspect outputs and decide if the query needs to be changed or not, thus giving it a huge performance boost.
+
+Let’s build this agent! 💪
+
+### Setup text to SQL
+```py
+from sqlalchemy import (
+    create_engine,
+    MetaData,
+    Table,
+    Column,
+    String,
+    Integer,
+    Float,
+    insert,
+    inspect,
+    text,
+)
+
+engine = create_engine("sqlite:///:memory:")
+metadata_obj = MetaData()
+
+# create the receipts SQL table
+table_name = "receipts"
+receipts = Table(
+    table_name,
+    metadata_obj,
+    Column("receipt_id", Integer, primary_key=True),
+    Column("customer_name", String(16), primary_key=True),
+    Column("price", Float),
+    Column("tip", Float),
+)
+metadata_obj.create_all(engine)
+```
+
+```py
+rows = [
+    {"receipt_id": 1, "customer_name": "Alan Payne", "price": 12.06, "tip": 1.20},
+    {"receipt_id": 2, "customer_name": "Alex Mason", "price": 23.86, "tip": 0.24},
+    {"receipt_id": 3, "customer_name": "Woodrow Wilson", "price": 53.43, "tip": 5.43},
+    {"receipt_id": 4, "customer_name": "Margaret James", "price": 21.11, "tip": 1.00},
+]
+for row in rows:
+    stmt = insert(receipts).values(**row)
+    with engine.begin() as connection:
+        cursor = connection.execute(stmt)
+```
+
+Let’s check that our system works with a basic query:
+
+```py
+with engine.connect() as con:
+    rows = con.execute(text("""SELECT * from receipts"""))
+    for row in rows:
+        print(row)
+```
+Output:
+```text
+(1, 'Alan Payne', 12.06, 1.2)
+(2, 'Alex Mason', 23.86, 0.24)
+(3, 'Woodrow Wilson', 53.43, 5.43)
+(4, 'Margaret James', 21.11, 1.0)
+```
+
+### Build our agent
+
+Now let’s make our SQL table retrievable by a tool.
+
+The tool’s description attribute will be embedded in the LLM’s prompt by the agent system: it gives the LLM information about how to use the tool. So that is where we want to describe the SQL table.
+
+```py
+inspector = inspect(engine)
+columns_info = [(col["name"], col["type"]) for col in inspector.get_columns("receipts")]
+
+table_description = "Columns:\n" + "\n".join([f"  - {name}: {col_type}" for name, col_type in columns_info])
+print(table_description)
+```
+
+```text
+Columns:
+  - receipt_id: INTEGER
+  - customer_name: VARCHAR(16)
+  - price: FLOAT
+  - tip: FLOAT
+```
+
+Now let’s build our tool. It needs the following: (read the documentation for more detail)
+- A docstring with an `Args:` part
+- Type hints
+
+```py
+from transformers.agents import tool
+
+
+@tool
+def sql_engine(query: str) -> str:
+    """
+    Allows you to perform SQL queries on the table. Returns a string representation of the result.
+    The table is named 'receipts'. Its description is as follows:
+        Columns:
+        - receipt_id: INTEGER
+        - customer_name: VARCHAR(16)
+        - price: FLOAT
+        - tip: FLOAT
+
+    Args:
+        query: The query to perform. This should be correct SQL.
+    """
+    output = ""
+    with engine.connect() as con:
+        rows = con.execute(text(query))
+        for row in rows:
+            output += "\n" + str(row)
+    return output
+```
+
+Now let us create an agent that leverages this tool.
+
+We use the ReactCodeAgent, which is transformers.agents’ main agent class: an agent that writes actions in code and can iterate on previous output according to the ReAct framework.
+
+The `llm_engine` is the LLM that powers the agent system. `HfApiEngine` allows you to call LLMs using HF’s Inference API, either via a Serverless or a Dedicated endpoint, but you could also use any proprietary API.
+
+```py
+from transformers.agents import ReactCodeAgent, HfApiEngine
+
+agent = ReactCodeAgent(
+    tools=[sql_engine],
+    llm_engine=HfApiEngine("meta-llama/Meta-Llama-3-8B-Instruct"),
+)
+agent.run("Can you give me the name of the client who got the most expensive receipt?")
+```
+
+### Level 2: Table joins
+
+Now let’s make it more challenging! We want our agent to handle joins across multiple tables.
+
+So let’s make a second table recording the names of waiters for each receipt_id!
+
+```py
+table_name = "waiters"
+receipts = Table(
+    table_name,
+    metadata_obj,
+    Column("receipt_id", Integer, primary_key=True),
+    Column("waiter_name", String(16), primary_key=True),
+)
+metadata_obj.create_all(engine)
+
+rows = [
+    {"receipt_id": 1, "waiter_name": "Corey Johnson"},
+    {"receipt_id": 2, "waiter_name": "Michael Watts"},
+    {"receipt_id": 3, "waiter_name": "Michael Watts"},
+    {"receipt_id": 4, "waiter_name": "Margaret James"},
+]
+for row in rows:
+    stmt = insert(receipts).values(**row)
+    with engine.begin() as connection:
+        cursor = connection.execute(stmt)
+```
+We need to update the `sql_engine` tool with this table’s description to let the LLM properly leverage information from this table.
+
+```py
+updated_description = """Allows you to perform SQL queries on the table. Beware that this tool's output is a string representation of the execution output.
+It can use the following tables:"""
+
+inspector = inspect(engine)
+for table in ["receipts", "waiters"]:
+    columns_info = [(col["name"], col["type"]) for col in inspector.get_columns(table)]
+
+    table_description = f"Table '{table}':\n"
+
+    table_description += "Columns:\n" + "\n".join([f"  - {name}: {col_type}" for name, col_type in columns_info])
+    updated_description += "\n\n" + table_description
+
+print(updated_description)
+```
+Since this request is a bit harder than the previous one, we’ll switch the llm engine to use the more powerful [Qwen/Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct)!
+
+```py
+sql_engine.description = updated_description
+
+agent = ReactCodeAgent(
+    tools=[sql_engine],
+    llm_engine=HfApiEngine("Qwen/Qwen2.5-72B-Instruct"),
+)
+
+agent.run("Which waiter got more total money from tips?")
+```
+It works right away! The setup was surprisingly simple, wasn’t it?
+
+This example is done! We've touched upon these concepts:
+- building new tools
+- updating a tool's description
+- switching to a stronger LLM to help agent reasoning
+
+✅ Now you can go build this text-to-SQL system you’ve always dreamt of! ✨
--- a/docs/source/main_classes/agent.md
+++ b/docs/source/main_classes/agent.md
@ -23,7 +23,7 @@ can vary as the APIs or underlying models are prone to change.

 </Tip>

-To learn more about agents and tools make sure to read the [introductory guide](../transformers_agents). This page
+To learn more about agents and tools make sure to read the [introductory guide](../index). This page
 contains the API docs for the underlying classes.

 ## Agents
@ -72,10 +72,6 @@ We provide two types of agents, based on the main [`Agent`] class:

 [[autodoc]] Toolbox

-### PipelineTool
-
-[[autodoc]] PipelineTool
-
 ### launch_gradio_demo

 [[autodoc]] launch_gradio_demo
@ -100,18 +96,16 @@ These engines have the following specification:
 For convenience, we have added a `TransformersEngine` that implements the points above, taking a pre-initialized `Pipeline` as input.

 ```python
->>> from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TransformersEngine
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TransformersEngine

->>> model_name = "HuggingFaceTB/SmolLM-135M-Instruct"
->>> tokenizer = AutoTokenizer.from_pretrained(model_name)
->>> model = AutoModelForCausalLM.from_pretrained(model_name)
+model_name = "HuggingFaceTB/SmolLM-135M-Instruct"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name)

->>> pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

->>> engine = TransformersEngine(pipe)
->>> engine([{"role": "user", "content": "Ok!"}], stop_sequences=["great"])
-
-"What a "
+engine = TransformersEngine(pipe)
+engine([{"role": "user", "content": "Ok!"}], stop_sequences=["great"])
 ```

 [[autodoc]] TransformersEngine
@ -121,16 +115,17 @@ For convenience, we have added a `TransformersEngine` that implements the points
 The `HfApiEngine` is an engine that wraps an [HF Inference API](https://huggingface.co/docs/api-inference/index) client for the execution of the LLM.

 ```python
->>> from transformers import HfApiEngine
+from transformers import HfApiEngine

->>> messages = [
-...   {"role": "user", "content": "Hello, how are you?"},
-...   {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
-...   {"role": "user", "content": "No need to help, take it easy."},
-... ]
-
->>> HfApiEngine()(messages, stop_sequences=["conversation"])
+messages = [
+  {"role": "user", "content": "Hello, how are you?"},
+  {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
+  {"role": "user", "content": "No need to help, take it easy."},
+]

+HfApiEngine()(messages, stop_sequences=["conversation"])
+```
+```text
 "That's very kind of you to say! It's always nice to have a relaxed "
 ```

--- a/docs/source/main_classes/backbones.md
+++ b/docs/source/main_classes/backbones.md
@ -1,60 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Backbone
-
-A backbone is a model used for feature extraction for higher level computer vision tasks such as object detection and image classification. Transformers provides an [`AutoBackbone`] class for initializing a Transformers backbone from pretrained model weights, and two utility classes:
-
-* [`~utils.BackboneMixin`] enables initializing a backbone from Transformers or [timm](https://hf.co/docs/timm/index) and includes functions for returning the output features and indices.
-* [`~utils.BackboneConfigMixin`] sets the output features and indices of the backbone configuration.
-
-[timm](https://hf.co/docs/timm/index) models are loaded with the [`TimmBackbone`] and [`TimmBackboneConfig`] classes.
-
-Backbones are supported for the following models:
-
-* [BEiT](../model_doc/beit)
-* [BiT](../model_doc/bit)
-* [ConvNext](../model_doc/convnext)
-* [ConvNextV2](../model_doc/convnextv2)
-* [DiNAT](../model_doc/dinat)
-* [DINOV2](../model_doc/dinov2)
-* [FocalNet](../model_doc/focalnet)
-* [MaskFormer](../model_doc/maskformer)
-* [NAT](../model_doc/nat)
-* [ResNet](../model_doc/resnet)
-* [Swin Transformer](../model_doc/swin)
-* [Swin Transformer v2](../model_doc/swinv2)
-* [ViTDet](../model_doc/vitdet)
-
-## AutoBackbone
-
-[[autodoc]] AutoBackbone
-
-## BackboneMixin
-
-[[autodoc]] utils.BackboneMixin
-
-## BackboneConfigMixin
-
-[[autodoc]] utils.BackboneConfigMixin
-
-## TimmBackbone
-
-[[autodoc]] models.timm_backbone.TimmBackbone
-
-## TimmBackboneConfig
-
-[[autodoc]] models.timm_backbone.TimmBackboneConfig
--- a/docs/source/main_classes/callback.md
+++ b/docs/source/main_classes/callback.md
@ -1,133 +0,0 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Callbacks
-
-Callbacks are objects that can customize the behavior of the training loop in the PyTorch
-[`Trainer`] (this feature is not yet implemented in TensorFlow). They can inspect the training loop
-state (for progress reporting, logging on TensorBoard or other ML platforms...) and make decisions (like early
-stopping).
-
-Callbacks are "read only" pieces of code: apart from the [`TrainerControl`] object they return, they
-cannot change anything in the training loop. For customizations that require changes in the training loop, you should
-subclass [`Trainer`] and override the methods you need (see [trainer](trainer) for examples).
-
-By default, `TrainingArguments.report_to` is set to `"all"`, so a [`Trainer`] will use the following callbacks.
-
- [`DefaultFlowCallback`] which handles the default behavior for logging, saving and evaluation.
- [`PrinterCallback`] or [`ProgressCallback`] to display progress and print the
-  logs (the first one is used if you deactivate tqdm through the [`TrainingArguments`], otherwise
-  it's the second one).
- [`~integrations.TensorBoardCallback`] if tensorboard is accessible (either through PyTorch >= 1.4
-  or tensorboardX).
- [`~integrations.WandbCallback`] if [wandb](https://www.wandb.com/) is installed.
- [`~integrations.CometCallback`] if [comet_ml](https://www.comet.com/site/) is installed.
- [`~integrations.MLflowCallback`] if [mlflow](https://www.mlflow.org/) is installed.
- [`~integrations.NeptuneCallback`] if [neptune](https://neptune.ai/) is installed.
- [`~integrations.AzureMLCallback`] if [azureml-sdk](https://pypi.org/project/azureml-sdk/) is
-  installed.
- [`~integrations.CodeCarbonCallback`] if [codecarbon](https://pypi.org/project/codecarbon/) is
-  installed.
- [`~integrations.ClearMLCallback`] if [clearml](https://github.com/allegroai/clearml) is installed.
- [`~integrations.DagsHubCallback`] if [dagshub](https://dagshub.com/) is installed.
- [`~integrations.FlyteCallback`] if [flyte](https://flyte.org/) is installed.
- [`~integrations.DVCLiveCallback`] if [dvclive](https://dvc.org/doc/dvclive) is installed.
-
-If a package is installed but you don't wish to use the accompanying integration, you can change `TrainingArguments.report_to` to a list of just those integrations you want to use (e.g. `["azure_ml", "wandb"]`). 
-
-The main class that implements callbacks is [`TrainerCallback`]. It gets the
-[`TrainingArguments`] used to instantiate the [`Trainer`], can access that
-Trainer's internal state via [`TrainerState`], and can take some actions on the training loop via
-[`TrainerControl`].
-
-
-## Available Callbacks
-
-Here is the list of the available [`TrainerCallback`] in the library:
-
-[[autodoc]] integrations.CometCallback
-    - setup
-
-[[autodoc]] DefaultFlowCallback
-
-[[autodoc]] PrinterCallback
-
-[[autodoc]] ProgressCallback
-
-[[autodoc]] EarlyStoppingCallback
-
-[[autodoc]] integrations.TensorBoardCallback
-
-[[autodoc]] integrations.WandbCallback
-    - setup
-
-[[autodoc]] integrations.MLflowCallback
-    - setup
-
-[[autodoc]] integrations.AzureMLCallback
-
-[[autodoc]] integrations.CodeCarbonCallback
-
-[[autodoc]] integrations.NeptuneCallback
-
-[[autodoc]] integrations.ClearMLCallback
-
-[[autodoc]] integrations.DagsHubCallback
-
-[[autodoc]] integrations.FlyteCallback
-
-[[autodoc]] integrations.DVCLiveCallback
-    - setup
-
-## TrainerCallback
-
-[[autodoc]] TrainerCallback
-
-Here is an example of how to register a custom callback with the PyTorch [`Trainer`]:
-
-```python
-class MyCallback(TrainerCallback):
-    "A callback that prints a message at the beginning of training"
-
-    def on_train_begin(self, args, state, control, **kwargs):
-        print("Starting training")
-
-
-trainer = Trainer(
-    model,
-    args,
-    train_dataset=train_dataset,
-    eval_dataset=eval_dataset,
-    callbacks=[MyCallback],  # We can either pass the callback class this way or an instance of it (MyCallback())
-)
-```
-
-Another way to register a callback is to call `trainer.add_callback()` as follows:
-
-```python
-trainer = Trainer(...)
-trainer.add_callback(MyCallback)
-# Alternatively, we can pass an instance of the callback class
-trainer.add_callback(MyCallback())
-```
-
-## TrainerState
-
-[[autodoc]] TrainerState
-
-## TrainerControl
-
-[[autodoc]] TrainerControl
--- a/docs/source/main_classes/configuration.md
+++ b/docs/source/main_classes/configuration.md
@ -1,32 +0,0 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Configuration
-
-The base class [`PretrainedConfig`] implements the common methods for loading/saving a configuration
-either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded
-from HuggingFace's AWS S3 repository).
-
-Each derived config class implements model specific attributes. Common attributes present in all config classes are:
-`hidden_size`, `num_attention_heads`, and `num_hidden_layers`. Text models further implement:
-`vocab_size`.
-
-
-## PretrainedConfig
-
-[[autodoc]] PretrainedConfig
-    - push_to_hub
-    - all
--- a/docs/source/main_classes/data_collator.md
+++ b/docs/source/main_classes/data_collator.md
@ -1,73 +0,0 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Data Collator
-
-Data collators are objects that will form a batch by using a list of dataset elements as input. These elements are of
-the same type as the elements of `train_dataset` or `eval_dataset`.
-
-To be able to build batches, data collators may apply some processing (like padding). Some of them (like
-[`DataCollatorForLanguageModeling`]) also apply some random data augmentation (like random masking)
-on the formed batch.
-
-Examples of use can be found in the [example scripts](../examples) or [example notebooks](../notebooks).
-
-
-## Default data collator
-
-[[autodoc]] data.data_collator.default_data_collator
-
-## DefaultDataCollator
-
-[[autodoc]] data.data_collator.DefaultDataCollator
-
-## DataCollatorWithPadding
-
-[[autodoc]] data.data_collator.DataCollatorWithPadding
-
-## DataCollatorForTokenClassification
-
-[[autodoc]] data.data_collator.DataCollatorForTokenClassification
-
-## DataCollatorForSeq2Seq
-
-[[autodoc]] data.data_collator.DataCollatorForSeq2Seq
-
-## DataCollatorForLanguageModeling
-
-[[autodoc]] data.data_collator.DataCollatorForLanguageModeling
-    - numpy_mask_tokens
-    - tf_mask_tokens
-    - torch_mask_tokens
-
-## DataCollatorForWholeWordMask
-
-[[autodoc]] data.data_collator.DataCollatorForWholeWordMask
-    - numpy_mask_tokens
-    - tf_mask_tokens
-    - torch_mask_tokens
-
-## DataCollatorForPermutationLanguageModeling
-
-[[autodoc]] data.data_collator.DataCollatorForPermutationLanguageModeling
-    - numpy_mask_tokens
-    - tf_mask_tokens
-    - torch_mask_tokens
-
-## DataCollatorWithFlattening
-
-[[autodoc]] data.data_collator.DataCollatorWithFlattening
-
--- a/docs/source/main_classes/deepspeed.md
+++ b/docs/source/main_classes/deepspeed.md
@ -1,32 +0,0 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# DeepSpeed
-
-[DeepSpeed](https://github.com/microsoft/DeepSpeed), powered by Zero Redundancy Optimizer (ZeRO), is an optimization library for training and fitting very large models onto a GPU. It is available in several ZeRO stages, where each stage progressively saves more GPU memory by partitioning the optimizer state, gradients, parameters, and enabling offloading to a CPU or NVMe. DeepSpeed is integrated with the [`Trainer`] class and most of the setup is automatically taken care of for you. 
-
-However, if you want to use DeepSpeed without the [`Trainer`], Transformers provides a [`HfDeepSpeedConfig`] class.
-
-<Tip>
-
-Learn more about using DeepSpeed with [`Trainer`] in the [DeepSpeed](../deepspeed) guide.
-
-</Tip>
-
-## HfDeepSpeedConfig
-
-[[autodoc]] integrations.HfDeepSpeedConfig
-    - all
--- a/docs/source/main_classes/executorch.md
+++ b/docs/source/main_classes/executorch.md
@ -1,33 +0,0 @@
-<!--Copyright (c) Meta Platforms, Inc. and affiliates.
-All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-
-# ExecuTorch
-
-[`ExecuTorch`](https://github.com/pytorch/executorch) is an end-to-end solution for enabling on-device inference capabilities across mobile and edge devices including wearables, embedded devices and microcontrollers. It is part of the PyTorch ecosystem and supports the deployment of PyTorch models with a focus on portability, productivity, and performance.
-
-ExecuTorch introduces well defined entry points to perform model, device, and/or use-case specific optimizations such as backend delegation, user-defined compiler transformations, memory planning, and more. The first step in preparing a PyTorch model for execution on an edge device using ExecuTorch is to export the model. This is achieved through the use of a PyTorch API called [`torch.export`](https://pytorch.org/docs/stable/export.html).
-
-
-## ExecuTorch Integration
-
-An integration point is being developed to ensure that 🤗 Transformers can be exported using `torch.export`. The goal of this integration is not only to enable export but also to ensure that the exported artifact can be further lowered and optimized to run efficiently in `ExecuTorch`, particularly for mobile and edge use cases.
-
-[[autodoc]] TorchExportableModuleWithStaticCache
-    - forward
-
-[[autodoc]] convert_and_export_with_cache
--- a/docs/source/main_classes/feature_extractor.md
+++ b/docs/source/main_classes/feature_extractor.md
@ -1,39 +0,0 @@
-<!--Copyright 2021 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Feature Extractor
-
-A feature extractor is in charge of preparing input features for audio or vision models. This includes feature extraction from sequences, e.g., pre-processing audio files to generate Log-Mel Spectrogram features, feature extraction from images, e.g., cropping image files, but also padding, normalization, and conversion to NumPy, PyTorch, and TensorFlow tensors.
-
-
-## FeatureExtractionMixin
-
-[[autodoc]] feature_extraction_utils.FeatureExtractionMixin
-    - from_pretrained
-    - save_pretrained
-
-## SequenceFeatureExtractor
-
-[[autodoc]] SequenceFeatureExtractor
-    - pad
-
-## BatchFeature
-
-[[autodoc]] BatchFeature
-
-## ImageFeatureExtractionMixin
-
-[[autodoc]] image_utils.ImageFeatureExtractionMixin
--- a/docs/source/main_classes/image_processor.md
+++ b/docs/source/main_classes/image_processor.md
@ -1,82 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Image Processor
-
-An image processor is in charge of preparing input features for vision models and post processing their outputs. This includes transformations such as resizing, normalization, and conversion to PyTorch, TensorFlow, Flax and Numpy tensors. It may also include model specific post-processing such as converting logits to segmentation masks.
-
-Fast image processors are available for a few models and more will be added in the future. They are based on the [torchvision](https://pytorch.org/vision/stable/index.html) library and provide a significant speed-up, especially when processing on GPU.
-They have the same API as the base image processors and can be used as drop-in replacements.
-To use a fast image processor, you need to install the `torchvision` library, and set the `use_fast` argument to `True` when instantiating the image processor:
-
-```python
-from transformers import AutoImageProcessor
-
-processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50", use_fast=True)
-```
-
-When using a fast image processor, you can also set the `device` argument to specify the device on which the processing should be done. By default, the processing is done on the same device as the inputs if the inputs are tensors, or on the CPU otherwise.
-
-```python
-from torchvision.io import read_image
-from transformers import DetrImageProcessorFast
-
-images = read_image("image.jpg")
-processor = DetrImageProcessorFast.from_pretrained("facebook/detr-resnet-50")
-images_processed = processor(images, return_tensors="pt", device="cuda")
-```
-
-Here are some speed comparisons between the base and fast image processors for the `DETR` and `RT-DETR` models, and how they impact overall inference time:
-
-<div class="flex">
-  <div>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/benchmark_results_full_pipeline_detr_fast_padded.png" />
-  </div>
-  <div>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/benchmark_results_full_pipeline_detr_fast_batched_compiled.png" />
-  </div>
-</div>
-
-<div class="flex">
-  <div>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/benchmark_results_full_pipeline_rt_detr_fast_single.png" />
-  </div>
-  <div>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/benchmark_results_full_pipeline_rt_detr_fast_batched.png" />
-  </div>
-</div>
-
-These benchmarks were run on an [AWS EC2 g5.2xlarge instance](https://aws.amazon.com/ec2/instance-types/g5/), utilizing an NVIDIA A10G Tensor Core GPU.
-
-
-## ImageProcessingMixin
-
-[[autodoc]] image_processing_utils.ImageProcessingMixin
-    - from_pretrained
-    - save_pretrained
-
-## BatchFeature
-
-[[autodoc]] BatchFeature
-
-## BaseImageProcessor
-
-[[autodoc]] image_processing_utils.BaseImageProcessor
-
-
-## BaseImageProcessorFast
-
-[[autodoc]] image_processing_utils_fast.BaseImageProcessorFast
--- a/docs/source/main_classes/keras_callbacks.md
+++ b/docs/source/main_classes/keras_callbacks.md
@ -1,28 +0,0 @@
-<!--Copyright 2021 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Keras callbacks
-
-When training a Transformers model with Keras, there are some library-specific callbacks available to automate common
-tasks:
-
-## KerasMetricCallback
-
-[[autodoc]] KerasMetricCallback
-
-## PushToHubCallback
-
-[[autodoc]] PushToHubCallback
--- a/docs/source/main_classes/logging.md
+++ b/docs/source/main_classes/logging.md
@ -1,119 +0,0 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Logging
-
-🤗 Transformers has a centralized logging system, so that you can set up the verbosity of the library easily.
-
-Currently the default verbosity of the library is `WARNING`.
-
-To change the level of verbosity, just use one of the direct setters. For instance, here is how to change the verbosity
-to the INFO level.
-
-```python
-import transformers
-
-transformers.logging.set_verbosity_info()
-```
-
-You can also use the environment variable `TRANSFORMERS_VERBOSITY` to override the default verbosity. You can set it
-to one of the following: `debug`, `info`, `warning`, `error`, `critical`, `fatal`. For example:
-
-```bash
-TRANSFORMERS_VERBOSITY=error ./myprogram.py
-```
-
-Additionally, some `warnings` can be disabled by setting the environment variable
-`TRANSFORMERS_NO_ADVISORY_WARNINGS` to a true value, like *1*. This will disable any warning that is logged using
-[`logger.warning_advice`]. For example:
-
-```bash
-TRANSFORMERS_NO_ADVISORY_WARNINGS=1 ./myprogram.py
-```
-
-Here is an example of how to use the same logger as the library in your own module or script:
-
-```python
-from transformers.utils import logging
-
-logging.set_verbosity_info()
-logger = logging.get_logger("transformers")
-logger.info("INFO")
-logger.warning("WARN")
-```
-
-
-All the methods of this logging module are documented below, the main ones are
-[`logging.get_verbosity`] to get the current level of verbosity in the logger and
-[`logging.set_verbosity`] to set the verbosity to the level of your choice. In order (from the least
-verbose to the most verbose), those levels (with their corresponding int values in parenthesis) are:
-
- `transformers.logging.CRITICAL` or `transformers.logging.FATAL` (int value, 50): only report the most
-  critical errors.
- `transformers.logging.ERROR` (int value, 40): only report errors.
- `transformers.logging.WARNING` or `transformers.logging.WARN` (int value, 30): only report errors and
-  warnings. This is the default level used by the library.
- `transformers.logging.INFO` (int value, 20): report errors, warnings and basic information.
- `transformers.logging.DEBUG` (int value, 10): report all information.
-
-By default, `tqdm` progress bars will be displayed during model download. [`logging.disable_progress_bar`] and [`logging.enable_progress_bar`] can be used to suppress or unsuppress this behavior.
-
-## `logging` vs `warnings`
-
-Python has two logging systems that are often used in conjunction: `logging`, which is explained above, and `warnings`,
-which allows further classification of warnings in specific buckets, e.g., `FutureWarning` for a feature or path
-that has already been deprecated and `DeprecationWarning` to indicate an upcoming deprecation.
-
-We use both in the `transformers` library. We leverage and adapt `logging`'s `captureWarnings` method to allow
-management of these warning messages by the verbosity setters above.
-
-What does that mean for developers of the library? We should respect the following heuristics:
- `warnings` should be favored for developers of the library and libraries dependent on `transformers`
- `logging` should be used for end-users of the library using it in every-day projects
-
-See reference of the `captureWarnings` method below.
-
-[[autodoc]] logging.captureWarnings
-
-## Base setters
-
-[[autodoc]] logging.set_verbosity_error
-
-[[autodoc]] logging.set_verbosity_warning
-
-[[autodoc]] logging.set_verbosity_info
-
-[[autodoc]] logging.set_verbosity_debug
-
-## Other functions
-
-[[autodoc]] logging.get_verbosity
-
-[[autodoc]] logging.set_verbosity
-
-[[autodoc]] logging.get_logger
-
-[[autodoc]] logging.enable_default_handler
-
-[[autodoc]] logging.disable_default_handler
-
-[[autodoc]] logging.enable_explicit_format
-
-[[autodoc]] logging.reset_format
-
-[[autodoc]] logging.enable_progress_bar
-
-[[autodoc]] logging.disable_progress_bar
--- a/docs/source/main_classes/model.md
+++ b/docs/source/main_classes/model.md
@ -1,73 +0,0 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Models
-
-The base classes [`PreTrainedModel`], [`TFPreTrainedModel`], and
-[`FlaxPreTrainedModel`] implement the common methods for loading/saving a model either from a local
-file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS
-S3 repository).
-
-[`PreTrainedModel`] and [`TFPreTrainedModel`] also implement a few methods which
-are common among all the models to:
-
- resize the input token embeddings when new tokens are added to the vocabulary
- prune the attention heads of the model.
-
-The other methods that are common to each model are defined in [`~modeling_utils.ModuleUtilsMixin`]
-(for the PyTorch models) and [`~modeling_tf_utils.TFModelUtilsMixin`] (for the TensorFlow models) or
-for text generation, [`~generation.GenerationMixin`] (for the PyTorch models),
-[`~generation.TFGenerationMixin`] (for the TensorFlow models) and
-[`~generation.FlaxGenerationMixin`] (for the Flax/JAX models).
-
-
-## PreTrainedModel
-
-[[autodoc]] PreTrainedModel
-    - push_to_hub
-    - all
-
-Custom models should also include a `_supports_assign_param_buffer`, which determines if superfast init can apply
-on the particular model. Signs that your model needs this are if `test_save_and_load_from_pretrained` fails. If so,
-set this to `False`.
-
-## ModuleUtilsMixin
-
-[[autodoc]] modeling_utils.ModuleUtilsMixin
-
-## TFPreTrainedModel
-
-[[autodoc]] TFPreTrainedModel
-    - push_to_hub
-    - all
-
-## TFModelUtilsMixin
-
-[[autodoc]] modeling_tf_utils.TFModelUtilsMixin
-
-## FlaxPreTrainedModel
-
-[[autodoc]] FlaxPreTrainedModel
-    - push_to_hub
-    - all
-
-## Pushing to the Hub
-
-[[autodoc]] utils.PushToHubMixin
-
-## Sharded checkpoints
-
-[[autodoc]] modeling_utils.load_sharded_checkpoint
--- a/docs/source/main_classes/onnx.md
+++ b/docs/source/main_classes/onnx.md
@ -1,54 +0,0 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Exporting 🤗 Transformers models to ONNX
-
-🤗 Transformers provides a `transformers.onnx` package that enables you to
-convert model checkpoints to an ONNX graph by leveraging configuration objects.
-
-See the [guide](../serialization) on exporting 🤗 Transformers models for more
-details.
-
-## ONNX Configurations
-
-We provide three abstract classes that you should inherit from, depending on the
-type of model architecture you wish to export:
-
-* Encoder-based models inherit from [`~onnx.config.OnnxConfig`]
-* Decoder-based models inherit from [`~onnx.config.OnnxConfigWithPast`]
-* Encoder-decoder models inherit from [`~onnx.config.OnnxSeq2SeqConfigWithPast`]
-
-### OnnxConfig
-
-[[autodoc]] onnx.config.OnnxConfig
-
-### OnnxConfigWithPast
-
-[[autodoc]] onnx.config.OnnxConfigWithPast
-
-### OnnxSeq2SeqConfigWithPast
-
-[[autodoc]] onnx.config.OnnxSeq2SeqConfigWithPast
-
-## ONNX Features
-
-Each ONNX configuration is associated with a set of _features_ that enable you
-to export models for different types of topologies or tasks.
-
-### FeaturesManager
-
-[[autodoc]] onnx.features.FeaturesManager
-
--- a/docs/source/main_classes/optimizer_schedules.md
+++ b/docs/source/main_classes/optimizer_schedules.md
@ -1,79 +0,0 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Optimization
-
-The `.optimization` module provides:
-
- an optimizer with weight decay fixed that can be used to fine-tune models,
- several schedules in the form of schedule objects that inherit from `_LRSchedule`, and
- a gradient accumulation class to accumulate the gradients of multiple batches.
-
-## AdamW (PyTorch)
-
-[[autodoc]] AdamW
-
-## AdaFactor (PyTorch)
-
-[[autodoc]] Adafactor
-
-## AdamWeightDecay (TensorFlow)
-
-[[autodoc]] AdamWeightDecay
-
-[[autodoc]] create_optimizer
-
-## Schedules
-
-### Learning Rate Schedules (PyTorch)
-
-[[autodoc]] SchedulerType
-
-[[autodoc]] get_scheduler
-
-[[autodoc]] get_constant_schedule
-
-[[autodoc]] get_constant_schedule_with_warmup
-
-<img alt="" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/warmup_constant_schedule.png"/>
-
-[[autodoc]] get_cosine_schedule_with_warmup
-
-<img alt="" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/warmup_cosine_schedule.png"/>
-
-[[autodoc]] get_cosine_with_hard_restarts_schedule_with_warmup
-
-<img alt="" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/warmup_cosine_hard_restarts_schedule.png"/>
-
-[[autodoc]] get_linear_schedule_with_warmup
-
-<img alt="" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/warmup_linear_schedule.png"/>
-
-[[autodoc]] get_polynomial_decay_schedule_with_warmup
-
-[[autodoc]] get_inverse_sqrt_schedule
-
-[[autodoc]] get_wsd_schedule
-
-### Warmup (TensorFlow)
-
-[[autodoc]] WarmUp
-
-## Gradient Strategies
-
-### GradientAccumulator (TensorFlow)
-
-[[autodoc]] GradientAccumulator
--- a/docs/source/main_classes/output.md
+++ b/docs/source/main_classes/output.md
@ -1,321 +0,0 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Model outputs
-
-All models have outputs that are instances of subclasses of [`~utils.ModelOutput`]. Those are
-data structures containing all the information returned by the model, but that can also be used as tuples or
-dictionaries.
-
-Let's see how this looks in an example:
-
-```python
-from transformers import BertTokenizer, BertForSequenceClassification
-import torch
-
-tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
-model = BertForSequenceClassification.from_pretrained("google-bert/bert-base-uncased")
-
-inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
-labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-outputs = model(**inputs, labels=labels)
-```
-
-The `outputs` object is a [`~modeling_outputs.SequenceClassifierOutput`]. As we can see in the
-documentation of that class below, this means it has an optional `loss`, a `logits`, an optional `hidden_states` and
-an optional `attentions` attribute. Here we have the `loss` since we passed along `labels`, but we don't have
-`hidden_states` and `attentions` because we didn't pass `output_hidden_states=True` or
-`output_attentions=True`.
-
-<Tip>
-
-When passing `output_hidden_states=True` you may expect the `outputs.hidden_states[-1]` to match `outputs.last_hidden_state` exactly.
-However, this is not always the case. Some models apply normalization or other subsequent processing to the last hidden state when it's returned.
-
-</Tip>
-
-
-You can access each attribute as you would usually do, and if that attribute has not been returned by the model, you
-will get `None`. Here for instance `outputs.loss` is the loss computed by the model, and `outputs.attentions` is
-`None`.
-
-When considering our `outputs` object as a tuple, it only considers the attributes that don't have `None` values.
-Here for instance, it has two elements, `loss` then `logits`, so
-
-```python
-outputs[:2]
-```
-
-will return the tuple `(outputs.loss, outputs.logits)` for instance.
-
-When considering our `outputs` object as a dictionary, it only considers the attributes that don't have `None`
-values. Here for instance, it has two keys that are `loss` and `logits`.
-
-We document here the generic model outputs that are used by more than one model type. Specific output types are
-documented on their corresponding model page.
-
-## ModelOutput
-
-[[autodoc]] utils.ModelOutput
-    - to_tuple
-
-## BaseModelOutput
-
-[[autodoc]] modeling_outputs.BaseModelOutput
-
-## BaseModelOutputWithPooling
-
-[[autodoc]] modeling_outputs.BaseModelOutputWithPooling
-
-## BaseModelOutputWithCrossAttentions
-
-[[autodoc]] modeling_outputs.BaseModelOutputWithCrossAttentions
-
-## BaseModelOutputWithPoolingAndCrossAttentions
-
-[[autodoc]] modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions
-
-## BaseModelOutputWithPast
-
-[[autodoc]] modeling_outputs.BaseModelOutputWithPast
-
-## BaseModelOutputWithPastAndCrossAttentions
-
-[[autodoc]] modeling_outputs.BaseModelOutputWithPastAndCrossAttentions
-
-## Seq2SeqModelOutput
-
-[[autodoc]] modeling_outputs.Seq2SeqModelOutput
-
-## CausalLMOutput
-
-[[autodoc]] modeling_outputs.CausalLMOutput
-
-## CausalLMOutputWithCrossAttentions
-
-[[autodoc]] modeling_outputs.CausalLMOutputWithCrossAttentions
-
-## CausalLMOutputWithPast
-
-[[autodoc]] modeling_outputs.CausalLMOutputWithPast
-
-## MaskedLMOutput
-
-[[autodoc]] modeling_outputs.MaskedLMOutput
-
-## Seq2SeqLMOutput
-
-[[autodoc]] modeling_outputs.Seq2SeqLMOutput
-
-## NextSentencePredictorOutput
-
-[[autodoc]] modeling_outputs.NextSentencePredictorOutput
-
-## SequenceClassifierOutput
-
-[[autodoc]] modeling_outputs.SequenceClassifierOutput
-
-## Seq2SeqSequenceClassifierOutput
-
-[[autodoc]] modeling_outputs.Seq2SeqSequenceClassifierOutput
-
-## MultipleChoiceModelOutput
-
-[[autodoc]] modeling_outputs.MultipleChoiceModelOutput
-
-## TokenClassifierOutput
-
-[[autodoc]] modeling_outputs.TokenClassifierOutput
-
-## QuestionAnsweringModelOutput
-
-[[autodoc]] modeling_outputs.QuestionAnsweringModelOutput
-
-## Seq2SeqQuestionAnsweringModelOutput
-
-[[autodoc]] modeling_outputs.Seq2SeqQuestionAnsweringModelOutput
-
-## Seq2SeqSpectrogramOutput
-
-[[autodoc]] modeling_outputs.Seq2SeqSpectrogramOutput
-
-## SemanticSegmenterOutput
-
-[[autodoc]] modeling_outputs.SemanticSegmenterOutput
-
-## ImageClassifierOutput
-
-[[autodoc]] modeling_outputs.ImageClassifierOutput
-
-## ImageClassifierOutputWithNoAttention
-
-[[autodoc]] modeling_outputs.ImageClassifierOutputWithNoAttention
-
-## DepthEstimatorOutput
-
-[[autodoc]] modeling_outputs.DepthEstimatorOutput
-
-## Wav2Vec2BaseModelOutput
-
-[[autodoc]] modeling_outputs.Wav2Vec2BaseModelOutput
-
-## XVectorOutput
-
-[[autodoc]] modeling_outputs.XVectorOutput
-
-## Seq2SeqTSModelOutput
-
-[[autodoc]] modeling_outputs.Seq2SeqTSModelOutput
-
-## Seq2SeqTSPredictionOutput
-
-[[autodoc]] modeling_outputs.Seq2SeqTSPredictionOutput
-
-## SampleTSPredictionOutput
-
-[[autodoc]] modeling_outputs.SampleTSPredictionOutput
-
-## TFBaseModelOutput
-
-[[autodoc]] modeling_tf_outputs.TFBaseModelOutput
-
-## TFBaseModelOutputWithPooling
-
-[[autodoc]] modeling_tf_outputs.TFBaseModelOutputWithPooling
-
-## TFBaseModelOutputWithPoolingAndCrossAttentions
-
-[[autodoc]] modeling_tf_outputs.TFBaseModelOutputWithPoolingAndCrossAttentions
-
-## TFBaseModelOutputWithPast
-
-[[autodoc]] modeling_tf_outputs.TFBaseModelOutputWithPast
-
-## TFBaseModelOutputWithPastAndCrossAttentions
-
-[[autodoc]] modeling_tf_outputs.TFBaseModelOutputWithPastAndCrossAttentions
-
-## TFSeq2SeqModelOutput
-
-[[autodoc]] modeling_tf_outputs.TFSeq2SeqModelOutput
-
-## TFCausalLMOutput
-
-[[autodoc]] modeling_tf_outputs.TFCausalLMOutput
-
-## TFCausalLMOutputWithCrossAttentions
-
-[[autodoc]] modeling_tf_outputs.TFCausalLMOutputWithCrossAttentions
-
-## TFCausalLMOutputWithPast
-
-[[autodoc]] modeling_tf_outputs.TFCausalLMOutputWithPast
-
-## TFMaskedLMOutput
-
-[[autodoc]] modeling_tf_outputs.TFMaskedLMOutput
-
-## TFSeq2SeqLMOutput
-
-[[autodoc]] modeling_tf_outputs.TFSeq2SeqLMOutput
-
-## TFNextSentencePredictorOutput
-
-[[autodoc]] modeling_tf_outputs.TFNextSentencePredictorOutput
-
-## TFSequenceClassifierOutput
-
-[[autodoc]] modeling_tf_outputs.TFSequenceClassifierOutput
-
-## TFSeq2SeqSequenceClassifierOutput
-
-[[autodoc]] modeling_tf_outputs.TFSeq2SeqSequenceClassifierOutput
-
-## TFMultipleChoiceModelOutput
-
-[[autodoc]] modeling_tf_outputs.TFMultipleChoiceModelOutput
-
-## TFTokenClassifierOutput
-
-[[autodoc]] modeling_tf_outputs.TFTokenClassifierOutput
-
-## TFQuestionAnsweringModelOutput
-
-[[autodoc]] modeling_tf_outputs.TFQuestionAnsweringModelOutput
-
-## TFSeq2SeqQuestionAnsweringModelOutput
-
-[[autodoc]] modeling_tf_outputs.TFSeq2SeqQuestionAnsweringModelOutput
-
-## FlaxBaseModelOutput
-
-[[autodoc]] modeling_flax_outputs.FlaxBaseModelOutput
-
-## FlaxBaseModelOutputWithPast
-
-[[autodoc]] modeling_flax_outputs.FlaxBaseModelOutputWithPast
-
-## FlaxBaseModelOutputWithPooling
-
-[[autodoc]] modeling_flax_outputs.FlaxBaseModelOutputWithPooling
-
-## FlaxBaseModelOutputWithPastAndCrossAttentions
-
-[[autodoc]] modeling_flax_outputs.FlaxBaseModelOutputWithPastAndCrossAttentions
-
-## FlaxSeq2SeqModelOutput
-
-[[autodoc]] modeling_flax_outputs.FlaxSeq2SeqModelOutput
-
-## FlaxCausalLMOutputWithCrossAttentions
-
-[[autodoc]] modeling_flax_outputs.FlaxCausalLMOutputWithCrossAttentions
-
-## FlaxMaskedLMOutput
-
-[[autodoc]] modeling_flax_outputs.FlaxMaskedLMOutput
-
-## FlaxSeq2SeqLMOutput
-
-[[autodoc]] modeling_flax_outputs.FlaxSeq2SeqLMOutput
-
-## FlaxNextSentencePredictorOutput
-
-[[autodoc]] modeling_flax_outputs.FlaxNextSentencePredictorOutput
-
-## FlaxSequenceClassifierOutput
-
-[[autodoc]] modeling_flax_outputs.FlaxSequenceClassifierOutput
-
-## FlaxSeq2SeqSequenceClassifierOutput
-
-[[autodoc]] modeling_flax_outputs.FlaxSeq2SeqSequenceClassifierOutput
-
-## FlaxMultipleChoiceModelOutput
-
-[[autodoc]] modeling_flax_outputs.FlaxMultipleChoiceModelOutput
-
-## FlaxTokenClassifierOutput
-
-[[autodoc]] modeling_flax_outputs.FlaxTokenClassifierOutput
-
-## FlaxQuestionAnsweringModelOutput
-
-[[autodoc]] modeling_flax_outputs.FlaxQuestionAnsweringModelOutput
-
-## FlaxSeq2SeqQuestionAnsweringModelOutput
-
-[[autodoc]] modeling_flax_outputs.FlaxSeq2SeqQuestionAnsweringModelOutput
--- a/docs/source/main_classes/pipelines.md
+++ b/docs/source/main_classes/pipelines.md
@ -1,501 +0,0 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Pipelines
-
-The pipelines are a great and easy way to use models for inference. These pipelines are objects that abstract most of
-the complex code from the library, offering a simple API dedicated to several tasks, including Named Entity
-Recognition, Masked Language Modeling, Sentiment Analysis, Feature Extraction and Question Answering. See the
-[task summary](../task_summary) for examples of use.
-
-There are two categories of pipeline abstractions to be aware of:
-
- The [`pipeline`] which is the most powerful object encapsulating all other pipelines.
- Task-specific pipelines are available for [audio](#audio), [computer vision](#computer-vision), [natural language processing](#natural-language-processing), and [multimodal](#multimodal) tasks.
-
-## The pipeline abstraction
-
-The *pipeline* abstraction is a wrapper around all the other available pipelines. It is instantiated like any other
-pipeline but can provide additional quality-of-life features.
-
-Simple call on one item:
-
-```python
->>> pipe = pipeline("text-classification")
->>> pipe("This restaurant is awesome")
-[{'label': 'POSITIVE', 'score': 0.9998743534088135}]
-```
-
-If you want to use a specific model from the [hub](https://huggingface.co) you can ignore the task if the model on
-the hub already defines it:
-
-```python
->>> pipe = pipeline(model="FacebookAI/roberta-large-mnli")
->>> pipe("This restaurant is awesome")
-[{'label': 'NEUTRAL', 'score': 0.7313136458396912}]
-```
-
-To call a pipeline on many items, you can call it with a *list*.
-
-```python
->>> pipe = pipeline("text-classification")
->>> pipe(["This restaurant is awesome", "This restaurant is awful"])
-[{'label': 'POSITIVE', 'score': 0.9998743534088135},
- {'label': 'NEGATIVE', 'score': 0.9996669292449951}]
-```
-
-To iterate over full datasets it is recommended to use a `dataset` directly. This means you don't need to allocate
-the whole dataset at once, nor do you need to do batching yourself. This should work just as fast as custom loops on
-GPU. If it doesn't, don't hesitate to create an issue.
-
-```python
-import datasets
-from transformers import pipeline
-from transformers.pipelines.pt_utils import KeyDataset
-from tqdm.auto import tqdm
-
-pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=0)
-dataset = datasets.load_dataset("superb", name="asr", split="test")
-
-# KeyDataset (only *pt*) will simply return the item in the dict returned by the dataset item
-# as we're not interested in the *target* part of the dataset. For sentence pair use KeyPairDataset
-for out in tqdm(pipe(KeyDataset(dataset, "file"))):
-    print(out)
-    # {"text": "NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD NIGHT HUSBAND"}
-    # {"text": ....}
-    # ....
-```
-
-For ease of use, a generator is also possible:
-
-
-```python
-from transformers import pipeline
-
-pipe = pipeline("text-classification")
-
-
-def data():
-    while True:
-        # This could come from a dataset, a database, a queue or HTTP request
-        # in a server
-        # Caveat: because this is iterative, you cannot use `num_workers > 1` variable
-        # to use multiple threads to preprocess data. You can still have 1 thread that
-        # does the preprocessing while the main runs the big inference
-        yield "This is a test"
-
-
-for out in pipe(data()):
-    print(out)
-    # {"text": "NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD NIGHT HUSBAND"}
-    # {"text": ....}
-    # ....
-```
-
-[[autodoc]] pipeline
-
-## Pipeline batching
-
-All pipelines can use batching. This will work
-whenever the pipeline uses its streaming ability (so when passing lists or `Dataset` or `generator`).
-
-```python
-from transformers import pipeline
-from transformers.pipelines.pt_utils import KeyDataset
-import datasets
-
-dataset = datasets.load_dataset("imdb", name="plain_text", split="unsupervised")
-pipe = pipeline("text-classification", device=0)
-for out in pipe(KeyDataset(dataset, "text"), batch_size=8, truncation="only_first"):
-    print(out)
-    # [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
-    # Exactly the same output as before, but the content are passed
-    # as batches to the model
-```
-
-<Tip warning={true}>
-
-However, this is not automatically a win for performance. It can be either a 10x speedup or 5x slowdown depending
-on hardware, data and the actual model being used.
-
-Example where it's mostly a speedup:
-
-</Tip>
-
-```python
-from transformers import pipeline
-from torch.utils.data import Dataset
-from tqdm.auto import tqdm
-
-pipe = pipeline("text-classification", device=0)
-
-
-class MyDataset(Dataset):
-    def __len__(self):
-        return 5000
-
-    def __getitem__(self, i):
-        return "This is a test"
-
-
-dataset = MyDataset()
-
-for batch_size in [1, 8, 64, 256]:
-    print("-" * 30)
-    print(f"Streaming batch_size={batch_size}")
-    for out in tqdm(pipe(dataset, batch_size=batch_size), total=len(dataset)):
-        pass
-```
-
-```
-# On GTX 970
------------------------------
-Streaming no batching
-100%|██████████████████████████████████████████████████████████████████████| 5000/5000 [00:26<00:00, 187.52it/s]
------------------------------
-Streaming batch_size=8
-100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:04<00:00, 1205.95it/s]
------------------------------
-Streaming batch_size=64
-100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:02<00:00, 2478.24it/s]
------------------------------
-Streaming batch_size=256
-100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:01<00:00, 2554.43it/s]
-(diminishing returns, saturated the GPU)
-```
-
-Example where it's mostly a slowdown:
-
-```python
-class MyDataset(Dataset):
-    def __len__(self):
-        return 5000
-
-    def __getitem__(self, i):
-        if i % 64 == 0:
-            n = 100
-        else:
-            n = 1
-        return "This is a test" * n
-```
-
-This is an occasional very long sentence compared to the others. In that case, the **whole** batch will need to be 400
-tokens long, so the whole batch will be [64, 400] instead of [64, 4], leading to the high slowdown. Even worse, on
-bigger batches, the program simply crashes.
-
-
-```
------------------------------
-Streaming no batching
-100%|█████████████████████████████████████████████████████████████████████| 1000/1000 [00:05<00:00, 183.69it/s]
------------------------------
-Streaming batch_size=8
-100%|█████████████████████████████████████████████████████████████████████| 1000/1000 [00:03<00:00, 265.74it/s]
------------------------------
-Streaming batch_size=64
-100%|██████████████████████████████████████████████████████████████████████| 1000/1000 [00:26<00:00, 37.80it/s]
------------------------------
-Streaming batch_size=256
-  0%|                                                                                 | 0/1000 [00:00<?, ?it/s]
-Traceback (most recent call last):
-  File "/home/nicolas/src/transformers/test.py", line 42, in <module>
-    for out in tqdm(pipe(dataset, batch_size=256), total=len(dataset)):
-....
-    q = q / math.sqrt(dim_per_head)  # (bs, n_heads, q_length, dim_per_head)
-RuntimeError: CUDA out of memory. Tried to allocate 376.00 MiB (GPU 0; 3.95 GiB total capacity; 1.72 GiB already allocated; 354.88 MiB free; 2.46 GiB reserved in total by PyTorch)
-```
-
-There are no good (general) solutions for this problem, and your mileage may vary depending on your
-use cases.
-
-For users, a rule of thumb is:
-
- **Measure performance on your load, with your hardware. Measure, measure, and keep measuring. Real numbers are the
-  only way to go.**
- If you are latency constrained (live product doing inference), don't batch.
- If you are using CPU, don't batch.
- If you are optimizing for throughput (you want to run your model on a bunch of static data), on GPU, then:
-
-  - If you have no clue about the size of the sequence_length ("natural" data), by default don't batch, measure and
-    try tentatively to add it, add OOM checks to recover when it will fail (and it will at some point if you don't
-    control the sequence_length.)
-  - If your sequence_length is super regular, then batching is more likely to be VERY interesting, measure and push
-    it until you get OOMs.
-  - The larger the GPU the more likely batching is going to be more interesting
- As soon as you enable batching, make sure you can handle OOMs nicely.
-
-## Pipeline chunk batching
-
-`zero-shot-classification` and `question-answering` are slightly specific in the sense that a single input might yield
-multiple forward passes of a model. Under normal circumstances, this would yield issues with the `batch_size` argument.
-
-In order to circumvent this issue, both of these pipelines are a bit specific, they are `ChunkPipeline` instead of
-regular `Pipeline`. In short:
-
-
-```python
-preprocessed = pipe.preprocess(inputs)
-model_outputs = pipe.forward(preprocessed)
-outputs = pipe.postprocess(model_outputs)
-```
-
-Now becomes:
-
-
-```python
-all_model_outputs = []
-for preprocessed in pipe.preprocess(inputs):
-    model_outputs = pipe.forward(preprocessed)
-    all_model_outputs.append(model_outputs)
-outputs = pipe.postprocess(all_model_outputs)
-```
-
-This should be very transparent to your code because the pipelines are used in
-the same way.
-
-This is a simplified view, since the pipeline can also handle the batching automatically! This means you don't have to care
-about how many forward passes your inputs are actually going to trigger; you can optimize the `batch_size`
-independently of the inputs. The caveats from the previous section still apply.
-
-## Pipeline FP16 inference
-Models can be run in FP16 which can be significantly faster on GPU while saving memory. Most models will not suffer noticeable performance loss from this. The larger the model, the less likely that it will.
-
-To enable FP16 inference, you can simply pass `torch_dtype=torch.float16` or `torch_dtype='float16'` to the pipeline constructor. Note that this only works for models with a PyTorch backend. Your inputs will be converted to FP16 internally.
-
-## Pipeline custom code
-
-If you want to override a specific pipeline.
-
-Don't hesitate to create an issue for your task at hand, the goal of the pipeline is to be easy to use and support most
-cases, so `transformers` could maybe support your use case.
-
-
-If you simply want to try it yourself, you can:
-
- Subclass your pipeline of choice
-
-```python
-class MyPipeline(TextClassificationPipeline):
-    def postprocess():
-        # Your code goes here
-        scores = scores * 100
-        # And here
-
-
-my_pipeline = MyPipeline(model=model, tokenizer=tokenizer, ...)
-# or if you use *pipeline* function, then:
-my_pipeline = pipeline(model="xxxx", pipeline_class=MyPipeline)
-```
-
-That should enable you to do all the custom code you want.
-
-
-## Implementing a pipeline
-
-[Implementing a new pipeline](../add_new_pipeline)
-
-## Audio
-
-Pipelines available for audio tasks include the following.
-
-### AudioClassificationPipeline
-
-[[autodoc]] AudioClassificationPipeline
-    - __call__
-    - all
-
-### AutomaticSpeechRecognitionPipeline
-
-[[autodoc]] AutomaticSpeechRecognitionPipeline
-    - __call__
-    - all
-
-### TextToAudioPipeline
-
-[[autodoc]] TextToAudioPipeline
-    - __call__
-    - all
-
-
-### ZeroShotAudioClassificationPipeline
-
-[[autodoc]] ZeroShotAudioClassificationPipeline
-    - __call__
-    - all
-
-## Computer vision
-
-Pipelines available for computer vision tasks include the following.
-
-### DepthEstimationPipeline
-[[autodoc]] DepthEstimationPipeline
-    - __call__
-    - all
-
-### ImageClassificationPipeline
-
-[[autodoc]] ImageClassificationPipeline
-    - __call__
-    - all
-
-### ImageSegmentationPipeline
-
-[[autodoc]] ImageSegmentationPipeline
-    - __call__
-    - all
-
-### ImageToImagePipeline
-
-[[autodoc]] ImageToImagePipeline
-    - __call__
-    - all
-
-### ObjectDetectionPipeline
-
-[[autodoc]] ObjectDetectionPipeline
-    - __call__
-    - all
-
-### VideoClassificationPipeline
-
-[[autodoc]] VideoClassificationPipeline
-    - __call__
-    - all
-
-### ZeroShotImageClassificationPipeline
-
-[[autodoc]] ZeroShotImageClassificationPipeline
-    - __call__
-    - all
-
-### ZeroShotObjectDetectionPipeline
-
-[[autodoc]] ZeroShotObjectDetectionPipeline
-    - __call__
-    - all
-
-## Natural Language Processing
-
-Pipelines available for natural language processing tasks include the following.
-
-### FillMaskPipeline
-
-[[autodoc]] FillMaskPipeline
-    - __call__
-    - all
-
-### QuestionAnsweringPipeline
-
-[[autodoc]] QuestionAnsweringPipeline
-    - __call__
-    - all
-
-### SummarizationPipeline
-
-[[autodoc]] SummarizationPipeline
-    - __call__
-    - all
-
-### TableQuestionAnsweringPipeline
-
-[[autodoc]] TableQuestionAnsweringPipeline
-    - __call__
-
-### TextClassificationPipeline
-
-[[autodoc]] TextClassificationPipeline
-    - __call__
-    - all
-
-### TextGenerationPipeline
-
-[[autodoc]] TextGenerationPipeline
-    - __call__
-    - all
-
-### Text2TextGenerationPipeline
-
-[[autodoc]] Text2TextGenerationPipeline
-    - __call__
-    - all
-
-### TokenClassificationPipeline
-
-[[autodoc]] TokenClassificationPipeline
-    - __call__
-    - all
-
-### TranslationPipeline
-
-[[autodoc]] TranslationPipeline
-    - __call__
-    - all
-
-### ZeroShotClassificationPipeline
-
-[[autodoc]] ZeroShotClassificationPipeline
-    - __call__
-    - all
-
-## Multimodal
-
-Pipelines available for multimodal tasks include the following.
-
-### DocumentQuestionAnsweringPipeline
-
-[[autodoc]] DocumentQuestionAnsweringPipeline
-    - __call__
-    - all
-
-### FeatureExtractionPipeline
-
-[[autodoc]] FeatureExtractionPipeline
-    - __call__
-    - all
-
-### ImageFeatureExtractionPipeline
-
-[[autodoc]] ImageFeatureExtractionPipeline
-    - __call__
-    - all
-
-### ImageToTextPipeline
-
-[[autodoc]] ImageToTextPipeline
-    - __call__
-    - all
-
-### ImageTextToTextPipeline
-
-[[autodoc]] ImageTextToTextPipeline
-    - __call__
-    - all
-
-### MaskGenerationPipeline
-
-[[autodoc]] MaskGenerationPipeline
-    - __call__
-    - all
-
-### VisualQuestionAnsweringPipeline
-
-[[autodoc]] VisualQuestionAnsweringPipeline
-    - __call__
-    - all
-
-## Parent class: `Pipeline`
-
-[[autodoc]] Pipeline
--- a/docs/source/main_classes/processors.md
+++ b/docs/source/main_classes/processors.md
@ -1,163 +0,0 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Processors
-
-Processors can mean two different things in the Transformers library:
- the objects that pre-process inputs for multi-modal models such as [Wav2Vec2](../model_doc/wav2vec2) (speech and text)
-  or [CLIP](../model_doc/clip) (text and vision)
- deprecated objects that were used in older versions of the library to preprocess data for GLUE or SQUAD.
-
-## Multi-modal processors
-
-Any multi-modal model will require an object to encode or decode the data that groups several modalities (among text,
-vision and audio). This is handled by objects called processors, which group together two or more processing objects
-such as tokenizers (for the text modality), image processors (for vision) and feature extractors (for audio).
-
-Those processors inherit from the following base class that implements the saving and loading functionality:
-
-[[autodoc]] ProcessorMixin
-
-## Deprecated processors
-
-All processors follow the same architecture which is that of the
-[`~data.processors.utils.DataProcessor`]. The processor returns a list of
-[`~data.processors.utils.InputExample`]. These
-[`~data.processors.utils.InputExample`] can be converted to
-[`~data.processors.utils.InputFeatures`] in order to be fed to the model.
-
-[[autodoc]] data.processors.utils.DataProcessor
-
-[[autodoc]] data.processors.utils.InputExample
-
-[[autodoc]] data.processors.utils.InputFeatures
-
-## GLUE
-
-[General Language Understanding Evaluation (GLUE)](https://gluebenchmark.com/) is a benchmark that evaluates the
-performance of models across a diverse set of existing NLU tasks. It was released together with the paper [GLUE: A
-multi-task benchmark and analysis platform for natural language understanding](https://openreview.net/pdf?id=rJ4km2R5t7)
-
-This library hosts a total of 10 processors for the following tasks: MRPC, MNLI, MNLI (mismatched), CoLA, SST2, STSB,
-QQP, QNLI, RTE and WNLI.
-
-Those processors are:
-
- [`~data.processors.utils.MrpcProcessor`]
- [`~data.processors.utils.MnliProcessor`]
- [`~data.processors.utils.MnliMismatchedProcessor`]
- [`~data.processors.utils.Sst2Processor`]
- [`~data.processors.utils.StsbProcessor`]
- [`~data.processors.utils.QqpProcessor`]
- [`~data.processors.utils.QnliProcessor`]
- [`~data.processors.utils.RteProcessor`]
- [`~data.processors.utils.WnliProcessor`]
-
-Additionally, the following method can be used to load values from a data file and convert them to a list of
-[`~data.processors.utils.InputExample`].
-
-[[autodoc]] data.processors.glue.glue_convert_examples_to_features
-
-
-## XNLI
-
-[The Cross-Lingual NLI Corpus (XNLI)](https://www.nyu.edu/projects/bowman/xnli/) is a benchmark that evaluates the
-quality of cross-lingual text representations. XNLI is a crowd-sourced dataset based on [*MultiNLI*](http://www.nyu.edu/projects/bowman/multinli/): pairs of text are labeled with textual entailment annotations for 15
-different languages (including both high-resource languages such as English and low-resource languages such as Swahili).
-
-It was released together with the paper [XNLI: Evaluating Cross-lingual Sentence Representations](https://arxiv.org/abs/1809.05053)
-
-This library hosts the processor to load the XNLI data:
-
- [`~data.processors.utils.XnliProcessor`]
-
-Please note that since the gold labels are available on the test set, evaluation is performed on the test set.
-
-An example using these processors is given in the [run_xnli.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_xnli.py) script.
-
-
-## SQuAD
-
-[The Stanford Question Answering Dataset (SQuAD)](https://rajpurkar.github.io/SQuAD-explorer//) is a benchmark that
-evaluates the performance of models on question answering. Two versions are available, v1.1 and v2.0. The first version
-(v1.1) was released together with the paper [SQuAD: 100,000+ Questions for Machine Comprehension of Text](https://arxiv.org/abs/1606.05250). The second version (v2.0) was released alongside the paper [Know What You Don't
-Know: Unanswerable Questions for SQuAD](https://arxiv.org/abs/1806.03822).
-
-This library hosts a processor for each of the two versions:
-
-### Processors
-
-Those processors are:
-
- [`~data.processors.utils.SquadV1Processor`]
- [`~data.processors.utils.SquadV2Processor`]
-
-They both inherit from the abstract class [`~data.processors.utils.SquadProcessor`]
-
-[[autodoc]] data.processors.squad.SquadProcessor
-    - all
-
-Additionally, the following method can be used to convert SQuAD examples into
-[`~data.processors.utils.SquadFeatures`] that can be used as model inputs.
-
-[[autodoc]] data.processors.squad.squad_convert_examples_to_features
-
-
-These processors as well as the aforementioned method can be used with files containing the data as well as with the
-*tensorflow_datasets* package. Examples are given below.
-
-
-### Example usage
-
-Here is an example using the processors as well as the conversion method using data files:
-
-```python
-# Loading a V2 processor
-processor = SquadV2Processor()
-examples = processor.get_dev_examples(squad_v2_data_dir)
-
-# Loading a V1 processor
-processor = SquadV1Processor()
-examples = processor.get_dev_examples(squad_v1_data_dir)
-
-features = squad_convert_examples_to_features(
-    examples=examples,
-    tokenizer=tokenizer,
-    max_seq_length=max_seq_length,
-    doc_stride=args.doc_stride,
-    max_query_length=max_query_length,
-    is_training=not evaluate,
-)
-```
-
-Using *tensorflow_datasets* is as easy as using a data file:
-
-```python
-# tensorflow_datasets only handle Squad V1.
-tfds_examples = tfds.load("squad")
-examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
-
-features = squad_convert_examples_to_features(
-    examples=examples,
-    tokenizer=tokenizer,
-    max_seq_length=max_seq_length,
-    doc_stride=args.doc_stride,
-    max_query_length=max_query_length,
-    is_training=not evaluate,
-)
-```
-
-Another example using these processors is given in the [run_squad.py](https://github.com/huggingface/transformers/tree/main/examples/legacy/question-answering/run_squad.py) script.
--- a/docs/source/main_classes/quantization.md
+++ b/docs/source/main_classes/quantization.md
@ -1,74 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Quantization
-
-Quantization techniques reduce memory and computational costs by representing weights and activations with lower-precision data types like 8-bit integers (int8). This enables loading larger models you normally wouldn't be able to fit into memory, and speeding up inference. Transformers supports the AWQ and GPTQ quantization algorithms and it supports 8-bit and 4-bit quantization with bitsandbytes.
-
-Quantization techniques that aren't supported in Transformers can be added with the [`HfQuantizer`] class.
-
-<Tip>
-
-Learn how to quantize models in the [Quantization](../quantization) guide.
-
-</Tip>
-
-## QuantoConfig
-
-[[autodoc]] QuantoConfig
-
-## AqlmConfig
-
-[[autodoc]] AqlmConfig
-
-## AwqConfig
-
-[[autodoc]] AwqConfig
-
-## EetqConfig
-[[autodoc]] EetqConfig
-
-## GPTQConfig
-
-[[autodoc]] GPTQConfig
-
-## BitsAndBytesConfig
-
-[[autodoc]] BitsAndBytesConfig
-
-## HfQuantizer
-
-[[autodoc]] quantizers.base.HfQuantizer
-
-## HqqConfig
-
-[[autodoc]] HqqConfig
-
-## FbgemmFp8Config
-
-[[autodoc]] FbgemmFp8Config
-
-## CompressedTensorsConfig
-
-[[autodoc]] CompressedTensorsConfig
-
-## TorchAoConfig
-
-[[autodoc]] TorchAoConfig
-
-## BitNetConfig
-
-[[autodoc]] BitNetConfig
--- a/docs/source/main_classes/text_generation.md
+++ b/docs/source/main_classes/text_generation.md
@ -1,59 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Generation
-
-Each framework has a generate method for text generation implemented in their respective `GenerationMixin` class:
-
- PyTorch [`~generation.GenerationMixin.generate`] is implemented in [`~generation.GenerationMixin`].
- TensorFlow [`~generation.TFGenerationMixin.generate`] is implemented in [`~generation.TFGenerationMixin`].
- Flax/JAX [`~generation.FlaxGenerationMixin.generate`] is implemented in [`~generation.FlaxGenerationMixin`].
-
-Regardless of your framework of choice, you can parameterize the generate method with a [`~generation.GenerationConfig`]
-class instance. Please refer to this class for the complete list of generation parameters, which control the behavior
-of the generation method.
-
-To learn how to inspect a model's generation configuration, what are the defaults, how to change the parameters ad hoc,
-and how to create and save a customized generation configuration, refer to the
-[text generation strategies guide](../generation_strategies). The guide also explains how to use related features,
-like token streaming.
-
-## GenerationConfig
-
-[[autodoc]] generation.GenerationConfig
-	- from_pretrained
-	- from_model_config
-	- save_pretrained
-	- update
-	- validate
-	- get_generation_mode
-
-## GenerationMixin
-
-[[autodoc]] GenerationMixin
-	- generate
-	- compute_transition_scores
-
-## TFGenerationMixin
-
-[[autodoc]] TFGenerationMixin
-	- generate
-	- compute_transition_scores
-
-## FlaxGenerationMixin
-
-[[autodoc]] FlaxGenerationMixin
-	- generate
--- a/docs/source/main_classes/tokenizer.md
+++ b/docs/source/main_classes/tokenizer.md
@ -1,104 +0,0 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Tokenizer
-
-A tokenizer is in charge of preparing the inputs for a model. The library contains tokenizers for all the models. Most
-of the tokenizers are available in two flavors: a full python implementation and a "Fast" implementation based on the
-Rust library [🤗 Tokenizers](https://github.com/huggingface/tokenizers). The "Fast" implementations allows:
-
-1. a significant speed-up in particular when doing batched tokenization and
-2. additional methods to map between the original string (character and words) and the token space (e.g. getting the
-   index of the token comprising a given character or the span of characters corresponding to a given token). 
-
-The base classes [`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`]
-implement the common methods for encoding string inputs in model inputs (see below) and instantiating/saving python and
-"Fast" tokenizers either from a local file or directory or from a pretrained tokenizer provided by the library
-(downloaded from HuggingFace's AWS S3 repository). They both rely on
-[`~tokenization_utils_base.PreTrainedTokenizerBase`] that contains the common methods, and
-[`~tokenization_utils_base.SpecialTokensMixin`].
-
-[`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`] thus implement the main
-methods for using all the tokenizers:
-
- Tokenizing (splitting strings in sub-word token strings), converting tokens strings to ids and back, and
-  encoding/decoding (i.e., tokenizing and converting to integers).
- Adding new tokens to the vocabulary in a way that is independent of the underlying structure (BPE, SentencePiece...).
- Managing special tokens (like mask, beginning-of-sentence, etc.): adding them, assigning them to attributes in the
-  tokenizer for easy access and making sure they are not split during tokenization.
-
-[`BatchEncoding`] holds the output of the
-[`~tokenization_utils_base.PreTrainedTokenizerBase`]'s encoding methods (`__call__`,
-`encode_plus` and `batch_encode_plus`) and is derived from a Python dictionary. When the tokenizer is a pure python
-tokenizer, this class behaves just like a standard python dictionary and holds the various model inputs computed by
-these methods (`input_ids`, `attention_mask`...). When the tokenizer is a "Fast" tokenizer (i.e., backed by
-HuggingFace [tokenizers library](https://github.com/huggingface/tokenizers)), this class provides in addition
-several advanced alignment methods which can be used to map between the original string (character and words) and the
-token space (e.g., getting the index of the token comprising a given character or the span of characters corresponding
-to a given token).
-
-
-# Multimodal Tokenizer
-
-Apart from that each tokenizer can be a "multimodal" tokenizer which means that the tokenizer will hold all relevant special tokens
-as part of tokenizer attributes for easier access. For example, if the tokenizer is loaded from a vision-language model like LLaVA, you will
-be able to access `tokenizer.image_token_id` to obtain the special image token used as a placeholder. 
-
-To enable extra special tokens for any type of tokenizer, you have to add the following lines and save the tokenizer. Extra special tokens do not
-have to be modality related and can ne anything that the model often needs access to. In the below code, tokenizer at `output_dir` will have direct access
-to three more special tokens.  
-
-```python
-vision_tokenizer = AutoTokenizer.from_pretrained(
-    "llava-hf/llava-1.5-7b-hf",
-    extra_special_tokens={"image_token": "<image>", "boi_token": "<image_start>", "eoi_token": "<image_end>"}
-)
-print(vision_tokenizer.image_token, vision_tokenizer.image_token_id)
-("<image>", 32000)
-```
-
-## PreTrainedTokenizer
-
-[[autodoc]] PreTrainedTokenizer
-    - __call__
-    - add_tokens
-    - add_special_tokens
-    - apply_chat_template
-    - batch_decode
-    - decode
-    - encode
-    - push_to_hub
-    - all
-
-## PreTrainedTokenizerFast
-
-The [`PreTrainedTokenizerFast`] depend on the [tokenizers](https://huggingface.co/docs/tokenizers) library. The tokenizers obtained from the 🤗 tokenizers library can be
-loaded very simply into 🤗 transformers. Take a look at the [Using tokenizers from 🤗 tokenizers](../fast_tokenizers) page to understand how this is done.
-
-[[autodoc]] PreTrainedTokenizerFast
-    - __call__
-    - add_tokens
-    - add_special_tokens
-    - apply_chat_template
-    - batch_decode
-    - decode
-    - encode
-    - push_to_hub
-    - all
-
-## BatchEncoding
-
-[[autodoc]] BatchEncoding
--- a/docs/source/main_classes/trainer.md
+++ b/docs/source/main_classes/trainer.md
@ -1,54 +0,0 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Trainer
-
-The [`Trainer`] class provides an API for feature-complete training in PyTorch, and it supports distributed training on multiple GPUs/TPUs, mixed precision for [NVIDIA GPUs](https://nvidia.github.io/apex/), [AMD GPUs](https://rocm.docs.amd.com/en/latest/rocm.html), and [`torch.amp`](https://pytorch.org/docs/stable/amp.html) for PyTorch. [`Trainer`] goes hand-in-hand with the [`TrainingArguments`] class, which offers a wide range of options to customize how a model is trained. Together, these two classes provide a complete training API.
-
-[`Seq2SeqTrainer`] and [`Seq2SeqTrainingArguments`] inherit from the [`Trainer`] and [`TrainingArguments`] classes and they're adapted for training models for sequence-to-sequence tasks such as summarization or translation.
-
-<Tip warning={true}>
-
-The [`Trainer`] class is optimized for 🤗 Transformers models and can have surprising behaviors
-when used with other models. When using it with your own model, make sure:
-
- your model always return tuples or subclasses of [`~utils.ModelOutput`]
- your model can compute the loss if a `labels` argument is provided and that loss is returned as the first
-  element of the tuple (if your model returns tuples)
- your model can accept multiple label arguments (use `label_names` in [`TrainingArguments`] to indicate their name to the [`Trainer`]) but none of them should be named `"label"`
-
-</Tip>
-
-## Trainer[[api-reference]]
-
-[[autodoc]] Trainer
-    - all
-
-## Seq2SeqTrainer
-
-[[autodoc]] Seq2SeqTrainer
-    - evaluate
-    - predict
-
-## TrainingArguments
-
-[[autodoc]] TrainingArguments
-    - all
-
-## Seq2SeqTrainingArguments
-
-[[autodoc]] Seq2SeqTrainingArguments
-    - all
--- a/docs/source/quicktour.md
+++ b/docs/source/quicktour.md
@ -1,4 +1,4 @@
-<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.

 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@ -9,181 +9,454 @@ Unless required by applicable law or agreed to in writing, software distributed
 an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 specific language governing permissions and limitations under the License.

-⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
 rendered properly in your Markdown viewer.
+
 -->
+# Agents and tools

-# Quicktour
+[[open-in-colab]]

-There are many ways to launch and run your code depending on your training environment ([torchrun](https://pytorch.org/docs/stable/elastic/run.html), [DeepSpeed](https://www.deepspeed.ai/), etc.) and available hardware. Accelerate offers a unified interface for launching and training on different distributed setups, allowing you to focus on your PyTorch training code instead of the intricacies of adapting your code to these different setups. This allows you to easily scale your PyTorch code for training and inference on distributed setups with hardware like GPUs and TPUs. Accelerate also provides Big Model Inference to make loading and running inference with really large models that usually don't fit in memory more accessible.
+### What is an agent?

-This quicktour introduces the three main features of Accelerate:
+Large Language Models (LLMs) trained to perform [causal language modeling](./tasks/language_modeling) can tackle a wide range of tasks, but they often struggle with basic tasks like logic, calculation, and search. When prompted in domains in which they do not perform well, they often fail to generate the answer we expect them to.

-* a unified command line launching interface for distributed training scripts
-* a training library for adapting PyTorch training code to run on different distributed setups
-* Big Model Inference
+One approach to overcome this weakness is to create an *agent*.

-## Unified launch interface
+An agent is a system that uses an LLM as its engine, and it has access to functions called *tools*.

-Accelerate automatically selects the appropriate configuration values for any given distributed training framework (DeepSpeed, FSDP, etc.) through a unified configuration file generated from the [`accelerate config`](package_reference/cli#accelerate-config) command. You could also pass the configuration values explicitly to the command line which is helpful in certain situations like if you're using SLURM.
+These *tools* are functions for performing a task, and they contain all necessary description for the agent to properly use them.

+The agent can be programmed to:
+- devise a series of actions/tools and run them all at once,  like the [`CodeAgent`]
+- plan and execute actions/tools one by one and wait for the outcome of each action before launching the next one, like the [`JsonAgent`]

-But in most cases, you should always run [`accelerate config`](package_reference/cli#accelerate-config) first to help Accelerate learn about your training setup.
+### Types of agents

-```bash
-accelerate config
-```
+#### Code agent

-The [`accelerate config`](package_reference/cli#accelerate-config) command creates and saves a default_config.yaml file in Accelerates cache folder. This file stores the configuration for your training environment, which helps Accelerate correctly launch your training script based on your machine.
+This agent has a planning step, then generates python code to execute all its actions at once. It natively handles different input and output types for its tools, thus it is the recommended choice for multimodal tasks.

-After you've configured your environment, you can test your setup with [`accelerate test`](package_reference/cli#accelerate-test), which launches a short script to test the distributed environment.
+#### React agents

-```bash
-accelerate test
-```
+This is the go-to agent to solve reasoning tasks, since the ReAct framework ([Yao et al., 2022](https://huggingface.co/papers/2210.03629)) makes it really efficient to think on the basis of its previous observations.
+
+We implement two versions of ReAct agents:
+- [`JsonAgent`] generates tool calls as a JSON in its output.
+- [`CodeAgent`] is a new type of ReAct agent that generates its tool calls as blobs of code, which works really well for LLMs that have strong coding performance.

 > [!TIP]
-> Add `--config_file` to the `accelerate test` or `accelerate launch` command to specify the location of the configuration file if it is saved in a non-default location like the cache.
+> Read the [Open-source LLMs as LangChain Agents](https://huggingface.co/blog/open-source-llms-as-agents) blog post to learn more about ReAct agents.

-Once your environment is setup, launch your training script with [`accelerate launch`](package_reference/cli#accelerate-launch)!
+<div class="flex justify-center">
+    <img
+        class="block dark:hidden"
+        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Agent_ManimCE.gif"
+    />
+    <img
+        class="hidden dark:block"
+        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Agent_ManimCE.gif"
+    />
+</div>
+
+![Framework of a React Agent](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/open-source-llms-as-agents/ReAct.png)
+
+For example, here is how a ReAct Code agent would work its way through the following question.
+
+```py3
+agent.run(
+"""How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture
+proposed in Attention is All You Need?"""
+)
+```
+```text
+=====New task=====
+How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?
+====Agent is executing the code below:
+bert_blocks = search(query="number of blocks in BERT base encoder")
+print("BERT blocks:", bert_blocks)
+====
+Print outputs:
+BERT blocks: twelve encoder blocks
+
+====Agent is executing the code below:
+attention_layer = search(query="number of layers in Attention is All You Need")
+print("Attention layers:", attention_layer)
+====
+Print outputs:
+Attention layers: Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position- 2 Page 3 Figure 1: The Transformer - model architecture.
+
+====Agent is executing the code below:
+bert_blocks = 12
+attention_layers = 6
+diff = bert_blocks - attention_layers
+print("Difference in blocks:", diff)
+final_answer(diff)
+====
+
+Print outputs:
+Difference in blocks: 6
+
+Final answer: 6
+```
+
+### How can I build an agent?
+
+To initialize an agent, you need these arguments:
+
+- an LLM to power your agent - the agent is not exactly the LLM, it’s more like the agent is a program that uses an LLM as its engine.
+- a system prompt: what the LLM engine will be prompted with to generate its output
+- a toolbox from which the agent picks tools to execute
+- a parser to extract from the LLM output which tools to call and with which arguments
+
+Upon initialization of the agent system, the tool attributes are used to generate a tool description, then baked into the agent’s `system_prompt` to let it know which tools it can use and why.
+
+To start with, please install the `agents` extras in order to install all default dependencies.

 ```bash
-accelerate launch path_to_script.py --args_for_the_script
+pip install agents
 ```

-To learn more, check out the [Launch distributed code](basic_tutorials/launch) tutorial for more information about launching your scripts.
-
-We also have a [configuration zoo](https://github.com/huggingface/accelerate/blob/main/examples/config_yaml_templates) which showcases a number of premade **minimal** example configurations for a variety of setups you can run.
-
-## Adapt training code
-
-The next main feature of Accelerate is the [`Accelerator`] class which adapts your PyTorch code to run on different distributed setups.
-
-You only need to add a few lines of code to your training script to enable it to run on multiple GPUs or TPUs.
-
-```diff
-+ from accelerate import Accelerator
-+ accelerator = Accelerator()
-
-+ device = accelerator.device
-+ model, optimizer, training_dataloader, scheduler = accelerator.prepare(
-+     model, optimizer, training_dataloader, scheduler
-+ )
-
-  for batch in training_dataloader:
-      optimizer.zero_grad()
-      inputs, targets = batch
-     inputs = inputs.to(device)
-     targets = targets.to(device)
-      outputs = model(inputs)
-      loss = loss_function(outputs, targets)
-+     accelerator.backward(loss)
-      optimizer.step()
-      scheduler.step()
-```
-
-1. Import and instantiate the [`Accelerator`] class at the beginning of your training script. The [`Accelerator`] class initializes everything necessary for distributed training, and it automatically detects your training environment (a single machine with a GPU, a machine with several GPUs, several machines with multiple GPUs or a TPU, etc.) based on how the code was launched.
+Build your LLM engine by defining a `llm_engine` method which accepts a list of [messages](./chat_templating) and returns text. This callable also needs to accept a `stop_sequences` argument that indicates when to stop generating.

 ```python
-from accelerate import Accelerator
+from huggingface_hub import login, InferenceClient

-accelerator = Accelerator()
+login("<YOUR_HUGGINGFACEHUB_API_TOKEN>")
+
+model_id = "Qwen/Qwen2.5-72B-Instruct"
+
+client = InferenceClient(model=model_id)
+
+def llm_engine(messages, stop_sequences=["Task"]) -> str:
+    response = client.chat_completion(messages, stop=stop_sequences, max_tokens=1000)
+    answer = response.choices[0].message.content
+    return answer
 ```

-2. Remove calls like `.cuda()` on your model and input data. The [`Accelerator`] class automatically places these objects on the appropriate device for you.
+You could use any `llm_engine` method as long as:
+1. it follows the [messages format](./chat_templating) (`List[Dict[str, str]]`) for its input `messages`, and it returns a `str`.
+2. it stops generating outputs at the sequences passed in the argument `stop_sequences`

-> [!WARNING]
-> This step is *optional* but it is considered best practice to allow Accelerate to handle device placement. You could also deactivate automatic device placement by passing `device_placement=False` when initializing the [`Accelerator`]. If you want to explicitly place objects on a device with `.to(device)`, make sure you use `accelerator.device` instead. For example, if you create an optimizer before placing a model on `accelerator.device`, training fails on a TPU.
+Additionally, `llm_engine` can also take a `grammar` argument. In the case where you specify a `grammar` upon agent initialization, this argument will be passed to the calls to llm_engine, with the `grammar` that you defined upon initialization, to allow [constrained generation](https://huggingface.co/docs/text-generation-inference/conceptual/guidance) in order to force properly-formatted agent outputs.

-> [!WARNING]
-> Accelerate does not use non-blocking transfers by default for its automatic device placement, which can result in potentially unwanted CUDA synchronizations.  You can enable non-blocking transfers by passing a [`~utils.dataclasses.DataLoaderConfiguration`] with `non_blocking=True` set as the `dataloader_config` when initializing the [`Accelerator`].  As usual, non-blocking transfers will only work if the dataloader also has `pin_memory=True` set.  Be wary that using non-blocking transfers from GPU to CPU may cause incorrect results if it results in CPU operations being performed on non-ready tensors.
+You will also need a `tools` argument which accepts a list of `Tools` - it can be an empty list. You can also add the default toolbox on top of your `tools` list by defining the optional argument `add_base_tools=True`.

-```py
-device = accelerator.device
-```
-
-3. Pass all relevant PyTorch objects for training (optimizer, model, dataloader(s), learning rate scheduler) to the [`~Accelerator.prepare`] method as soon as they're created. This method wraps the model in a container optimized for your distributed setup, uses Accelerates version of the optimizer and scheduler, and creates a sharded version of your dataloader for distribution across GPUs or TPUs.
+Now you can create an agent, like [`CodeAgent`], and run it. You can also create a [`TransformersEngine`] with a pre-initialized pipeline to run inference on your local machine using `transformers`.
+For convenience, since agentic behaviours generally require strong models that are harder to run locally for now, we also provide the [`HfApiEngine`] class that initializes a `huggingface_hub.InferenceClient` under the hood. 

 ```python
-model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
-    model, optimizer, train_dataloader, lr_scheduler
+from agents import CodeAgent, HfApiEngine
+
+llm_engine = HfApiEngine(model=model_id)
+agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
+
+agent.run(
+    "Could you translate this sentence from French, say it out loud and return the audio.",
+    sentence="Où est la boulangerie la plus proche?",
 )
 ```

-4. Replace `loss.backward()` with [`~Accelerator.backward`] to use the correct `backward()` method for your training setup.
-
-```py
-accelerator.backward(loss)
-```
-
-Read [Accelerate’s internal mechanisms](concept_guides/internal_mechanism) guide to learn more details about how Accelerate adapts your code.
-
-### Distributed evaluation
-
-To perform distributed evaluation, pass your validation dataloader to the [`~Accelerator.prepare`] method:
+This will be handy in case of emergency baguette need!
+You can even leave the argument `llm_engine` undefined, and an [`HfApiEngine`] will be created by default.

 ```python
-validation_dataloader = accelerator.prepare(validation_dataloader)
-```
+from agents import CodeAgent

-Each device in your distributed setup only receives a part of the evaluation data, which means you should group your predictions together with the [`~Accelerator.gather_for_metrics`] method. This method requires all tensors to be the same size on each process, so if your tensors have different sizes on each process (for instance when dynamically padding to the maximum length in a batch), you should use the [`~Accelerator.pad_across_processes`] method to pad you tensor to the largest size across processes. Note that the tensors needs to be 1D and that we concatenate the tensors along the first dimension. 
+agent = CodeAgent(tools=[], add_base_tools=True)

-```python
-for inputs, targets in validation_dataloader:
-    predictions = model(inputs)
-    # Gather all predictions and targets
-    all_predictions, all_targets = accelerator.gather_for_metrics((predictions, targets))
-    # Example of use with a *Datasets.Metric*
-    metric.add_batch(all_predictions, all_targets)
-```
-
-For more complex cases (e.g. 2D tensors, don't want to concatenate tensors, dict of 3D tensors), you can pass `use_gather_object=True` in `gather_for_metrics`. This will return the list of objects after gathering. Note that using it with GPU tensors is not well supported and inefficient.
-
-> [!TIP]
-> Data at the end of a dataset may be duplicated so the batch can be equally divided among all workers. The [`~Accelerator.gather_for_metrics`] method automatically removes the duplicated data to calculate a more accurate metric.
-
-## Big Model Inference
-
-Accelerate's Big Model Inference has two main features, [`~accelerate.init_empty_weights`] and [`~accelerate.load_checkpoint_and_dispatch`], to load large models for inference that typically don't fit into memory.
-
-> [!TIP]
-> Take a look at the [Handling big models for inference](concept_guides/big_model_inference) guide for a better understanding of how Big Model Inference works under the hood.
-
-### Empty weights initialization
-
-The [`~accelerate.init_empty_weights`] context manager initializes models of any size by creating a *model skeleton* and moving and placing parameters each time they're created to PyTorch's [**meta**](https://pytorch.org/docs/main/meta.html) device. This way, not all weights are immediately loaded and only a small part of the model is loaded into memory at a time.
-
-For example, loading an empty [Mixtral-8x7B](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) model takes significantly less memory than fully loading the models and weights on the CPU.
-
-```py
-from accelerate import init_empty_weights
-from transformers import AutoConfig, AutoModelForCausalLM
-
-config = AutoConfig.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")
-with init_empty_weights():
-    model = AutoModelForCausalLM.from_config(config)
-```
-
-### Load and dispatch weights
-
-The [`~accelerate.load_checkpoint_and_dispatch`] function loads full or sharded checkpoints into the empty model, and automatically distribute weights across all available devices.
-
-The `device_map` parameter determines where to place each model layer, and specifiying `"auto"` places them on the GPU first, then the CPU, and finally the hard drive as memory-mapped tensors if there's still not enough memory. Use the `no_split_module_classes` parameter to indicate which modules shouldn't be split across devices (typically those with a residual connection).
-
-```py
-from accelerate import load_checkpoint_and_dispatch
-
-model_checkpoint = "your-local-model-folder"
-model = load_checkpoint_and_dispatch(
-    model, checkpoint=model_checkpoint, device_map="auto", no_split_module_classes=['Block']
+agent.run(
+    "Could you translate this sentence from French, say it out loud and give me the audio.",
+    sentence="Où est la boulangerie la plus proche?",
 )
 ```

-## Next steps
+Note that we used an additional `sentence` argument: you can pass text as additional arguments to the model.

-Now that you've been introduced to the main Accelerate features, your next steps could include:
+You can also use this to indicate the path to local or remote files for the model to use:

-* Check out the [tutorials](basic_tutorials/overview) for a gentle walkthrough of Accelerate. This is especially useful if you're new to distributed training and the library.
-* Dive into the [guides](usage_guides/explore) to see how to use Accelerate for specific use-cases.
-* Deepen your conceptual understanding of how Accelerate works internally by reading the [concept guides](concept_guides/internal_mechanism).
-* Look up classes and commands in the [API reference](package_reference/accelerator) to see what parameters and options are available.
+```py
+from agents import CodeAgent
+
+agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
+
+agent.run("Why does Mike not know many people in New York?", audio="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/recording.mp3")
+```
+
+
+The prompt and output parser were automatically defined, but you can easily inspect them through the `system_prompt_template` attribute of your agent.
+
+```python
+print(agent.system_prompt_template)
+```
+
+It's important to explain as clearly as possible the task you want to perform.
+Every [`~Agent.run`] operation is independent, and since an agent is powered by an LLM, minor variations in your prompt might yield completely different results.
+You can also run an agent consecutively for different tasks: each time the attributes `agent.task` and `agent.logs` will be re-initialized.
+
+
+#### Code execution
+
+A Python interpreter executes the code on a set of inputs passed along with your tools.
+This should be safe because the only functions that can be called are the tools you provided (especially if it's only tools by Hugging Face) and the print function, so you're already limited in what can be executed.
+
+The Python interpreter also doesn't allow imports by default outside of a safe list, so all the most obvious attacks shouldn't be an issue.
+You can still authorize additional imports by passing the authorized modules as a list of strings in the `additional_authorized_imports` argument upon initialization of your [`CodeAgent`]:
+
+```py
+from agents import CodeAgent
+
+agent = CodeAgent(tools=[], additional_authorized_imports=['requests', 'bs4'])
+agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'?")
+```
+This gives you at the end of the agent run:
+```text
+'Hugging Face – Blog'
+```
+The execution will stop at any code trying to perform an illegal operation or if there is a regular Python error with the code generated by the agent.
+
+> [!WARNING]
+> The LLM can generate arbitrary code that will then be executed: do not add any unsafe imports!
+
+### The system prompt
+
+An agent, or rather the LLM that drives the agent, generates an output based on the system prompt. The system prompt can be customized and tailored to the intended task. For example, check the system prompt for the [`CodeAgent`] (the version below is slightly simplified).
+
+```text
+You will be given a task to solve as best you can.
+You have access to the following tools:
+{{tool_descriptions}}
+
+To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
+
+At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task, then the tools that you want to use.
+Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '/End code' sequence.
+During each intermediate step, you can use 'print()' to save whatever important information you will then need.
+These print outputs will then be available in the 'Observation:' field, for using this information as input for the next step.
+
+In the end you have to return a final answer using the `final_answer` tool.
+
+Here are a few examples using notional tools:
+---
+{examples}
+
+Above example were using notional tools that might not exist for you. You only have acces to those tools:
+{{tool_names}}
+You also can perform computations in the python code you generate.
+
+Always provide a 'Thought:' and a 'Code:\n```py' sequence ending with '```<end_code>' sequence. You MUST provide at least the 'Code:' sequence to move forward.
+
+Remember to not perform too many operations in a single code block! You should split the task into intermediate code blocks.
+Print results at the end of each step to save the intermediate results. Then use final_answer() to return the final result.
+
+Remember to make sure that variables you use are all defined.
+
+Now Begin!
+```
+
+The system prompt includes:
+- An *introduction* that explains how the agent should behave and what tools are.
+- A description of all the tools that is defined by a `{{tool_descriptions}}` token that is dynamically replaced at runtime with the tools defined/chosen by the user.
+    - The tool description comes from the tool attributes, `name`, `description`, `inputs` and `output_type`,  and a simple `jinja2` template that you can refine.
+- The expected output format.
+
+You could improve the system prompt, for example, by adding an explanation of the output format.
+
+For maximum flexibility, you can overwrite the whole system prompt template by passing your custom prompt as an argument to the `system_prompt` parameter.
+
+```python
+from agents import JsonAgent, PythonInterpreterTool, JSON_SYSTEM_PROMPT
+
+modified_prompt = JSON_SYSTEM_PROMPT
+
+agent = JsonAgent(tools=[PythonInterpreterTool()], system_prompt=modified_prompt)
+```
+
+> [!WARNING]
+> Please make sure to define the `{{tool_descriptions}}` string somewhere in the `template` so the agent is aware
+> of the available tools.
+
+
+### Inspecting an agent run
+
+Here are a few useful attributes to inspect what happened after a run:
+- `agent.logs` stores the fine-grained logs of the agent. At every step of the agent's run, everything gets stored in a dictionary that then is appended to `agent.logs`.
+- Running `agent.write_inner_memory_from_logs()` creates an inner memory of the agent's logs for the LLM to view, as a list of chat messages. This method goes over each step of the log and only stores what it's interested in as a message: for instance, it will save the system prompt and task in separate messages, then for each step it will store the LLM output as a message, and the tool call output as another message. Use this if you want a higher-level view of what has happened - but not every log will be transcribed by this method.
+
+## Tools
+
+A tool is an atomic function to be used by an agent.
+
+You can for instance check the [`PythonInterpreterTool`]: it has a name, a description, input descriptions, an output type, and a `__call__` method to perform the action.
+
+When the agent is initialized, the tool attributes are used to generate a tool description which is baked into the agent's system prompt. This lets the agent know which tools it can use and why.
+
+### Default toolbox
+
+The `agents` library comes with a default toolbox for empowering agents, which you can add to your agent upon initialization with the argument `add_base_tools = True`:
+
+- **DuckDuckGo web search**: performs a web search using the DuckDuckGo browser.
+- **Python code interpreter**: runs your LLM-generated Python code in a secure environment. This tool will only be added to [`JsonAgent`] if you initialize it with `add_base_tools=True`, since code-based agents can already natively execute Python code.
+
+You can manually use a tool by loading it with the [`load_tool`] function and calling it with a task to perform.
+
+```python
+from transformers import load_tool
+
+search_tool = load_tool("web_search")
+print(search_tool("Who's the current president of Russia?"))
+```
+
+
+### Create a new tool
+
+You can create your own tool for use cases not covered by the default tools from Hugging Face.
+For example, let's create a tool that returns the most downloaded model for a given task from the Hub.
+
+You'll start with the code below.
+
+```python
+from huggingface_hub import list_models
+
+task = "text-classification"
+
+model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
+print(model.id)
+```
+
+This code can quickly be converted into a tool, just by wrapping it in a function and adding the `tool` decorator:
+
+
+```py
+from transformers import tool
+
+@tool
+def model_download_tool(task: str) -> str:
+    """
+    This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub.
+    It returns the name of the checkpoint.
+
+    Args:
+        task: The task for which
+    """
+    model = next(iter(list_models(filter="text-classification", sort="downloads", direction=-1)))
+    return model.id
+```
+
+The function needs:
+- A clear name. The name usually describes what the tool does. Since the code returns the model with the most downloads for a task, let's name it `model_download_tool`.
+- Type hints on both inputs and output
+- A description, that includes an 'Args:' part where each argument is described (without a type indication this time, it will be pulled from the type hint).
+All these will be automatically baked into the agent's system prompt upon initialization: so strive to make them as clear as possible!
+
+> [!TIP]
+> This definition format is the same as tool schemas used in `apply_chat_template`, the only difference is the added `tool` decorator: read more on our tool use API [here](https://huggingface.co/blog/unified-tool-use#passing-tools-to-a-chat-template).
+
+Then you can directly initialize your agent:
+```py
+from agents import CodeAgent
+agent = CodeAgent(tools=[model_download_tool], llm_engine=llm_engine)
+agent.run(
+    "Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?"
+)
+```
+
+You get the following:
+```text
+======== New task ========
+Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?
+==== Agent is executing the code below:
+most_downloaded_model = model_download_tool(task="text-to-video")
+print(f"The most downloaded model for the 'text-to-video' task is {most_downloaded_model}.")
+====
+```
+
+And the output:
+`"The most downloaded model for the 'text-to-video' task is ByteDance/AnimateDiff-Lightning."`
+
+## Multi-agents
+
+Multi-agent systems were introduced with Microsoft's framework [Autogen](https://huggingface.co/papers/2308.08155).
+It simply means having several agents working together to solve your task instead of only one.
+It empirically yields better performance on most benchmarks. The reason for this better performance is conceptually simple: for many tasks, rather than using a do-it-all system, you would prefer to specialize units on sub-tasks. Here, having agents with separate tool sets and memories allows you to achieve efficient specialization.
+
+You can easily build hierarchical multi-agent systems with `agents`.
+
+To do so, encapsulate the agent in a [`ManagedAgent`] object. This object needs arguments `agent`, `name`, and a `description`, which will then be embedded in the manager agent's system prompt to let it know how to call this managed agent, as we also do for tools.
+
+Here's an example of making an agent that manages a specific web search agent using our [`DuckDuckGoSearchTool`]:
+
+```py
+from agents import CodeAgent, HfApiEngine, DuckDuckGoSearchTool, ManagedAgent
+
+llm_engine = HfApiEngine()
+
+web_agent = CodeAgent(tools=[DuckDuckGoSearchTool()], llm_engine=llm_engine)
+
+managed_web_agent = ManagedAgent(
+    agent=web_agent,
+    name="web_search",
+    description="Runs web searches for you. Give it your query as an argument."
+)
+
+manager_agent = CodeAgent(
+    tools=[], llm_engine=llm_engine, managed_agents=[managed_web_agent]
+)
+
+manager_agent.run("Who is the CEO of Hugging Face?")
+```
+
+> [!TIP]
+> For an in-depth example of an efficient multi-agent implementation, see [how we pushed our multi-agent system to the top of the GAIA leaderboard](https://huggingface.co/blog/beating-gaia).
+
+
+## Display your agent run in a cool Gradio interface
+
+You can leverage `gradio.Chatbot` to display your agent's thoughts using `stream_to_gradio`, here is an example:
+
+```py
+import gradio as gr
+from transformers import (
+    load_tool,
+    CodeAgent,
+    HfApiEngine,
+    stream_to_gradio,
+)
+
+# Import tool from Hub
+image_generation_tool = load_tool("m-ric/text-to-image")
+
+llm_engine = HfApiEngine(model_id)
+
+# Initialize the agent with the image generation tool
+agent = CodeAgent(tools=[image_generation_tool], llm_engine=llm_engine)
+
+
+def interact_with_agent(task):
+    messages = []
+    messages.append(gr.ChatMessage(role="user", content=task))
+    yield messages
+    for msg in stream_to_gradio(agent, task):
+        messages.append(msg)
+        yield messages + [
+            gr.ChatMessage(role="assistant", content="⏳ Task not finished yet!")
+        ]
+    yield messages
+
+
+with gr.Blocks() as demo:
+    text_input = gr.Textbox(lines=1, label="Chat Message", value="Make me a picture of the Statue of Liberty.")
+    submit = gr.Button("Run illustrator agent!")
+    chatbot = gr.Chatbot(
+        label="Agent",
+        type="messages",
+        avatar_images=(
+            None,
+            "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png",
+        ),
+    )
+    submit.click(interact_with_agent, [text_input], [chatbot])
+
+if __name__ == "__main__":
+    demo.launch()
+```
--- a/docs/source/tutorials/building_good_agents.md
+++ b/docs/source/tutorials/building_good_agents.md
@ -15,7 +15,10 @@ rendered properly in your Markdown viewer.
 -->
 # Building good agents

-There's a world of difference between building an agent that works and one that doesn't. How to build into this latter category?
+[[open-in-colab]]
+
+There's a world of difference between building an agent that works and one that doesn't.
+How can we build agents that fall into the former category?
 In this guide, we're going to see best practices for building agents.

 > [!TIP]
@ -52,12 +55,19 @@ For instance, here's a tool that :

 First, here's a poor version:
 ```python
-from my_weather_api return convert_location_to_coordinates, get_weather_report_at_coordinates
-# Let's say "get_weather_report_at_coordinates" returns a list of [temperature in °C, risk of rain on a scale 0-1, wave height in m]
 import datetime
+from agents import tool
+
+def get_weather_report_at_coordinates(coordinates, date_time):
+    # Dummy function, returns a list of [temperature in °C, risk of rain on a scale 0-1, wave height in m]
+    return [28.0, 0.35, 0.85]
+
+def get_coordinates_from_location(location):
+    # Returns dummy coordinates
+    return [3.3, -42.0]

@tool
-def get_weather_api(location (str), date_time: str) -> str:
+def get_weather_api(location: str, date_time: str) -> str:
    """
    Returns the weather report.

@ -80,12 +90,8 @@ If the tool call fails, the error trace logged in memory can help the LLM revers

 A better way to build this tool would have been the following:
 ```python
-from my_weather_api return convert_location_to_coordinates, get_weather_report_at_coordinates
-# Let's say "get_weather_report_at_coordinates" returns a list of [temperature in °C, risk of rain on a scale 0-1, wave height in m]
-import datetime
-
@tool
-def get_weather_api(location (str), date_time: str) -> str:
+def get_weather_api(location: str, date_time: str) -> str:
    """
    Returns the weather report.

@ -154,3 +160,27 @@ Better ways to guide your LLM engine are:
 ### 3. Extra planning

 We provide a model for a supplementary planning step, that an agent can run regularly in-between normal action steps. In this step, there is no tool call, the LLM is simply asked to update a list of facts it knows and to reflect on what steps it should take next based on those facts.
+
+```py
+from agents import load_tool, CodeAgent, HfApiEngine, DuckDuckGoSearchTool
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# Import tool from Hub
+image_generation_tool = load_tool("m-ric/text-to-image", cache=False)
+
+search_tool = DuckDuckGoSearchTool()
+
+agent = CodeAgent(
+    tools=[search_tool],
+    llm_engine=HfApiEngine("Qwen/Qwen2.5-72B-Instruct"),
+    planning_interval=3 # This is where you activate planning!
+)
+
+# Run it!
+result = agent.run(
+    "How long would a cheetah at full speed take to run the length of Pont Alexandre III?",
+)
+print("RESULT:", result)
+```
--- a/docs/source/tutorials/tools.md
+++ b/docs/source/tutorials/tools.md
@ -41,7 +41,7 @@ The types for both `inputs` and `output_type` should be amongst [Pydantic format
 Also, all imports should be put within the tool's forward function, else you will get an error.

 ```python
-from transformers import Tool
+from agents import Tool

 class HFModelDownloadsTool(Tool):
    name = "model_download_counter"
@ -70,20 +70,16 @@ Now the custom `HfModelDownloadsTool` class is ready.
 You can also share your custom tool to the Hub by calling [`~Tool.push_to_hub`] on the tool. Make sure you've created a repository for it on the Hub and are using a token with write access.

 ```python
-from dotenv import load_dotenv
-
-load_dotenv()
-
-tool.push_to_hub("m-ric/hf-model-downloads", token=os.getenv("HF_TOKEN"))
+tool.push_to_hub("m-ric/hf-model-downloads", token="<YOUR_HUGGINGFACEHUB_API_TOKEN>")
 ```

 Load the tool with the [`~Tool.load_tool`] function and pass it to the `tools` parameter in your agent.
 Since running tools means running custom code, you need to make sure you trust the repository, and pass `trust_remote_code=True`.

 ```python
-from transformers import load_tool, CodeAgent
+from agents import load_tool, CodeAgent

-model_download_tool = load_tool("m-ric/hf-model-downloads")
+model_download_tool = load_tool("m-ric/hf-model-downloads", trust_remote_code=True)
 ```

 ### Import a Space as a tool 🚀
@ -95,10 +91,8 @@ You only need to provide the id of the Space on the Hub, its name, and a descrip
 For instance, let's import the [FLUX.1-dev](https://huggingface.co/black-forest-labs/FLUX.1-dev) Space from the Hub and use it to generate an image.

 ```python
-from transformers import Tool
-
 image_generation_tool = Tool.from_space(
-    "black-forest-labs/FLUX.1-dev",
+    "black-forest-labs/FLUX.1-schnell",
    name="image_generator",
    description="Generate an image from a prompt")

@ -111,9 +105,10 @@ And voilà, here's your image! 🏖️
 Then you can use this tool just like any other tool.  For example, let's improve the prompt  `a rabbit wearing a space suit` and generate an image of it.

 ```python
-from transformers import CodeAgent
+from agents import CodeAgent, HfApiEngine

-agent = CodeAgent(tools=[image_generation_tool])
+llm_engine = HfApiEngine("Qwen/Qwen2.5-Coder-32B-Instruct")
+agent = CodeAgent(tools=[image_generation_tool], llm_engine=llm_engine)

 agent.run(
    "Improve this prompt, then generate an image of it.", prompt='A rabbit wearing a space suit'
@ -145,7 +140,6 @@ Import and instantiate the tool, then pass it to the `Tool.from_gradio` method:

 ```python
 from gradio_tools import StableDiffusionPromptGeneratorTool
-from agents import Tool, load_tool, CodeAgent

 gradio_prompt_generator_tool = StableDiffusionPromptGeneratorTool()
 prompt_generator_tool = Tool.from_gradio(gradio_prompt_generator_tool)
@ -163,11 +157,10 @@ Here is how you can use it to recreate the intro's search result using a LangCha
 This tool will need `pip install langchain google-search-results -q` to work properly.
 ```python
 from langchain.agents import load_tools
-from agents import Tool, CodeAgent

 search_tool = Tool.from_langchain(load_tools(["serpapi"])[0])

-agent = CodeAgent(tools=[search_tool])
+agent = CodeAgent(tools=[search_tool], llm_engine=llm_engine)

 agent.run("How many more blocks (also denoted as layers) are in BERT base encoder compared to the encoder from the architecture proposed in Attention is All You Need?")
 ```
@ -179,23 +172,21 @@ You can manage an agent's toolbox by adding or replacing a tool.
 Let's add the `model_download_tool` to an existing agent initialized with only the default toolbox.

 ```python
-from transformers import CodeAgent
+from agents import HfApiEngine
+
+llm_engine = HfApiEngine("Qwen/Qwen2.5-Coder-32B-Instruct")

 agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
 agent.toolbox.add_tool(model_download_tool)
 ```
-Now we can leverage both the new tool and the previous text-to-speech tool:
+Now we can leverage the new tool:

 ```python
 agent.run(
-    "Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub and return the audio?"
+    "Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub but reverse the letters?"
 )
 ```

-| **Audio**                                                                                                                                            |
-|------------------------------------------------------------------------------------------------------------------------------------------------------|
-| <audio controls><source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/damo.wav" type="audio/wav"/> |
-

 > [!WARNING]
 > Beware when adding tools to an agent that already works well because it can bias selection towards your tool or select another tool other than the one already defined.
@ -214,8 +205,11 @@ Then pass them as a list to initialize you agent, and start using them!
 ```py
 from transformers import ToolCollection, CodeAgent

-image_tool_collection = ToolCollection(collection_slug="huggingface-tools/diffusion-tools-6630bb19a942c2306a2cdb6f")
-agent = CodeAgent(tools=[*image_tool_collection.tools], add_base_tools=True)
+image_tool_collection = ToolCollection(
+    collection_slug="huggingface-tools/diffusion-tools-6630bb19a942c2306a2cdb6f",
+    token="<YOUR_HUGGINGFACEHUB_API_TOKEN>"
+)
+agent = CodeAgent(tools=[*image_tool_collection.tools], llm_engine=llm_engine, add_base_tools=True)

 agent.run("Please draw me a picture of rivers and lakes.")
 ```
--- a/pyproject.toml
+++ b/pyproject.toml
@ -26,4 +26,7 @@ dependencies = [
 [project.optional-dependencies]
 dev = [
    "anthropic",
+]
+test = [
+    "gradio-tools"
 ]
--- a/src/agents/agents.py
+++ b/src/agents/agents.py
@ -151,6 +151,7 @@ def format_prompt_with_managed_agents_descriptions(
    if agent_descriptions_placeholder is None:
        agent_descriptions_placeholder = "{{managed_agents_descriptions}}"
    if agent_descriptions_placeholder not in prompt_template:
+        print("PROMPT TEMPLLL", prompt_template)
        raise ValueError(f"Provided prompt template does not contain the managed agents descriptions placeholder '{agent_descriptions_placeholder}'")
    if len(managed_agents.keys()) > 0:
        return prompt_template.replace(
--- a/src/agents/default_tools.py
+++ b/src/agents/default_tools.py
@ -147,13 +147,15 @@ class PythonInterpreterTool(Tool):
                ),
            }
        }
+        self.base_python_tool = BASE_PYTHON_TOOLS
+        self.python_evaluator = evaluate_python_code
        super().__init__(*args, **kwargs)

    def forward(self, code: str) -> str:
        output = str(
-            evaluate_python_code(
+            self.python_evaluator(
                code,
-                static_tools=BASE_PYTHON_TOOLS,
+                static_tools=self.base_python_tool,
                authorized_imports=self.authorized_imports,
            )
        )
--- a/src/agents/tools.py
+++ b/src/agents/tools.py
@ -16,16 +16,18 @@
 # limitations under the License.
 import ast
 import base64
+import builtins
 import importlib
 import inspect
 import io
 import json
 import os
+import re
 import tempfile
 import textwrap
 from functools import lru_cache, wraps
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Union, Set

 from huggingface_hub import (
    create_repo,
@ -92,11 +94,10 @@ def get_repo_type(repo_id, repo_type=None, **hub_kwargs):

 def setup_default_tools():
    default_tools = {}
-    main_module = importlib.import_module("transformers")
-    tools_module = main_module.agents
+    main_module = importlib.import_module("agents")

    for task_name, tool_class_name in TOOL_MAPPING.items():
-        tool_class = getattr(tools_module, tool_class_name)
+        tool_class = getattr(main_module, tool_class_name)
        tool_instance = tool_class()
        default_tools[tool_class.name] = tool_instance

@ -122,31 +123,24 @@ def validate_after_init(cls, do_validate_forward: bool = True):
    cls.__init__ = new_init
    return cls

-def validate_forward_method_args(cls):
+def validate_args_are_self_contained(source_code):
    """Validates that all names in forward method are properly defined.
    In particular it will check that all imports are done within the function."""
-    if 'forward' not in cls.__dict__:
-        return
-
-    forward = cls.__dict__['forward']
-    source_code = textwrap.dedent(inspect.getsource(forward))
-    tree = ast.parse(source_code)
+    print("CODDDD", source_code)
+    tree = ast.parse(textwrap.dedent(source_code))
    
    # Get function arguments
    func_node = tree.body[0]
-    arg_names = {arg.arg for arg in func_node.args.args}
+    arg_names = {arg.arg for arg in func_node.args.args} | {"kwargs"}

-
-    import builtins
    builtin_names = set(vars(builtins))

-    
-    # Find all used names that aren't arguments or self attributes
    class NameChecker(ast.NodeVisitor):
        def __init__(self):
            self.undefined_names = set()
            self.imports = {}
            self.from_imports = {}
+            self.assigned_names = set()

        def visit_Import(self, node):
            """Handle simple imports like 'import datetime'."""
@ -160,17 +154,63 @@ def validate_forward_method_args(cls):
            for name in node.names:
                actual_name = name.asname or name.name
                self.from_imports[actual_name] = (module, name.name, actual_name)
+
+        def visit_Assign(self, node):
+            """Track variable assignments."""
+            for target in node.targets:
+                if isinstance(target, ast.Name):
+                    self.assigned_names.add(target.id)
+            self.visit(node.value)
            
+        def visit_AnnAssign(self, node):
+            """Track annotated assignments."""
+            if isinstance(node.target, ast.Name):
+                self.assigned_names.add(node.target.id)
+            if node.value:
+                self.visit(node.value)
+
+        def _handle_for_target(self, target) -> Set[str]:
+            """Extract all names from a for loop target."""
+            names = set()
+            if isinstance(target, ast.Name):
+                names.add(target.id)
+            elif isinstance(target, ast.Tuple):
+                for elt in target.elts:
+                    if isinstance(elt, ast.Name):
+                        names.add(elt.id)
+            return names
+                
+        def visit_For(self, node):
+            """Track for-loop target variables and handle enumerate specially."""
+            # Add names from the target
+            target_names = self._handle_for_target(node.target)
+            self.assigned_names.update(target_names)
+            
+            # Special handling for enumerate
+            if (isinstance(node.iter, ast.Call) and 
+                isinstance(node.iter.func, ast.Name) and 
+                node.iter.func.id == 'enumerate'):
+                # For enumerate, if we have "for i, x in enumerate(...)", 
+                # both i and x should be marked as assigned
+                if isinstance(node.target, ast.Tuple):
+                    for elt in node.target.elts:
+                        if isinstance(elt, ast.Name):
+                            self.assigned_names.add(elt.id)
+                            
+            # Visit the rest of the node
+            self.generic_visit(node)
+
        def visit_Name(self, node):
            if (isinstance(node.ctx, ast.Load) and not (
                node.id == "tool" or
                node.id in builtin_names or
                node.id in arg_names or 
-                node.id == 'self'
+                node.id == 'self' or
+                node.id in self.assigned_names
            )):
                if node.id not in self.from_imports and node.id not in self.imports:
                    self.undefined_names.add(node.id)
-                
+                    
        def visit_Attribute(self, node):
            # Skip self.something
            if not (isinstance(node.value, ast.Name) and node.value.id == 'self'):
@ -182,9 +222,7 @@ def validate_forward_method_args(cls):
    if checker.undefined_names:
        raise ValueError(
            f"""The following names in forward method are not defined: {', '.join(checker.undefined_names)}.
-            Make sure all imports and variables are defined within the method.
-            For instance:
-            
+            Make sure all imports and variables are self-contained within the method.            
            """
        )

@ -233,7 +271,6 @@ class Tool:

    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)
-        validate_forward_method_args(cls)
        validate_after_init(cls, do_validate_forward=False)


@ -309,17 +346,18 @@ class Tool:

        # Save tool file
        forward_source_code = inspect.getsource(self.forward)
+        validate_args_are_self_contained(forward_source_code)
        tool_code = textwrap.dedent(f"""
-        from agents import Tool
+            from agents import Tool

-        class {class_name}(Tool):
-            name = "{self.name}"
-            description = "{self.description}"
-            inputs = {json.dumps(self.inputs, separators=(',', ':'))}
-            output_type = "{self.output_type}"
-        """).strip()
+            class {class_name}(Tool):
+                name = "{self.name}"
+                description = \"\"\"{self.description}\"\"\"
+                inputs = {json.dumps(self.inputs, separators=(',', ':'))}
+                output_type = "{self.output_type}"
+            """
+        ).strip()

-        import re
        def add_self_argument(source_code: str) -> str:
            """Add 'self' as first argument to a function definition if not present."""
            pattern = r'def forward\(((?!self)[^)]*)\)'
@ -516,6 +554,8 @@ class Tool:
            with open(module_path, "w") as f:
                f.write(tool_code)

+            print("TOOLCODE:\n", tool_code)
+
            # Load module from file path
            spec = importlib.util.spec_from_file_location("custom_tool", module_path)
            module = importlib.util.module_from_spec(spec)
@ -529,7 +569,7 @@ class Tool:
                    break

            if tool_class is None:
-                raise ValueError("No Tool subclass found in the code")
+                raise ValueError("No Tool subclass found in the code.")
        
        if not isinstance(tool_class.inputs, dict):
            tool_class.inputs = ast.literal_eval(tool_class.inputs)
@ -593,9 +633,9 @@ class Tool:
                api_name: Optional[str] = None,
                token: Optional[str] = None,
            ):
-                self.client = Client(space_id, hf_token=token)
                self.name = name
                self.description = description
+                self.client = Client(space_id, hf_token=token)
                space_description = self.client.view_api(
                    return_format="dict", print_info=False
                )["named_endpoints"]
@ -632,6 +672,7 @@ class Tool:
                    self.output_type = "audio"
                else:
                    self.output_type = "any"
+                self.is_initialized = True

            def sanitize_argument_for_prediction(self, arg):
                if isinstance(arg, ImageType):
@ -662,7 +703,7 @@ class Tool:
                return output

        return SpaceToolWrapper(
-            space_id, name, description, api_name=api_name, token=token
+            space_id=space_id, name=name, description=description, api_name=api_name, token=token
        )

    @staticmethod
@ -814,7 +855,13 @@ TOOL_MAPPING = {
 }


-def load_tool(task_or_repo_id, model_repo_id=None, token=None, **kwargs):
+def load_tool(
+        task_or_repo_id,
+        model_repo_id: Optional[str] = None,
+        token: Optional[str] = None,
+        trust_remote_code: bool=False,
+        **kwargs
+    ):
    """
    Main function to quickly load a tool, be it on the Hub or in the Transformers library.

@ -842,6 +889,8 @@ def load_tool(task_or_repo_id, model_repo_id=None, token=None, **kwargs):
        token (`str`, *optional*):
            The token to identify you on hf.co. If unset, will use the token generated when running `huggingface-cli
            login` (stored in `~/.huggingface`).
+        trust_remote_code (`bool`, *optional*, defaults to `False`):
+            Must be set to `True` to accept loading a tool from the Hub.
        kwargs (additional keyword arguments, *optional*):
            Additional keyword arguments that will be split in two: all arguments relevant to the Hub (such as
            `cache_dir`, `revision`, `subfolder`) will be used when downloading the files for your tool, and the others
@ -861,7 +910,7 @@ def load_tool(task_or_repo_id, model_repo_id=None, token=None, **kwargs):
            f"code that you have checked."
        )
        return Tool.from_hub(
-            task_or_repo_id, model_repo_id=model_repo_id, token=token, **kwargs
+            task_or_repo_id, model_repo_id=model_repo_id, token=token, trust_remote_code=trust_remote_code, **kwargs
        )


@ -1097,12 +1146,6 @@ class Toolbox:
        """Clears the toolbox"""
        self._tools = {}

-    # def _load_tools_if_needed(self):
-    #     for name, tool in self._tools.items():
-    #         if not isinstance(tool, Tool):
-    #             task_or_repo_id = tool.task if tool.repo_id is None else tool.repo_id
-    #             self._tools[name] = load_tool(task_or_repo_id)
-
    def __repr__(self):
        toolbox_description = "Toolbox contents:\n"
        for tool in self._tools.values():
--- a/tests/test_all_docs.py
+++ b/tests/test_all_docs.py
@ -18,12 +18,12 @@ import os
 import re
 import shutil
 import tempfile
-import unittest
 import subprocess
+import traceback
 import pytest
 from pathlib import Path
 from typing import List
-
+from dotenv import load_dotenv

 class SubprocessCallException(Exception):
    pass
@ -69,7 +69,9 @@ class DocCodeExtractor:
        combined_code = "\n\n".join(code_blocks)
        assert len(combined_code) > 0, "Code is empty!"
        tmp_file = Path(tmp_dir) / "test_script.py"
-        
+
+        print("COFF", combined_code)
+
        with open(tmp_file, "w", encoding="utf-8") as f:
            f.write(combined_code)
            
@ -86,12 +88,13 @@ class TestDocs:
        cls.docs_dir = Path(__file__).parent.parent / "docs" / "source"
        cls.extractor = DocCodeExtractor()

-        # Verify docs directory exists
        if not cls.docs_dir.exists():
            raise ValueError(f"Docs directory not found at {cls.docs_dir}")
+
+        load_dotenv()
+        cls.hf_token = os.getenv("HF_TOKEN")
        
-        # Verify we have markdown files
-        cls.md_files = list(cls.docs_dir.glob("*.md"))
+        cls.md_files = list(cls.docs_dir.rglob("*.md"))
        if not cls.md_files:
            raise ValueError(f"No markdown files found in {cls.docs_dir}")

@ -99,6 +102,7 @@ class TestDocs:
    def teardown_class(cls):
        shutil.rmtree(cls._tmpdir)

+    @pytest.mark.timeout(2)
    def test_single_doc(self, doc_path: Path):
        """Test a single documentation file."""
        with open(doc_path, "r", encoding="utf-8") as f:
@ -114,13 +118,18 @@ class TestDocs:
        
        # Create and execute test script
        try:
+            excluded_snippets = ["ToolCollection", "image_generation_tool", "from_langchain"]
+            code_blocks = [
+                block.replace("<YOUR_HUGGINGFACEHUB_API_TOKEN>", self.hf_token) for block in code_blocks
+                if not any([snippet in block for snippet in excluded_snippets]) # Exclude these tools that take longer to run and add dependencies
+            ]
            test_script = self.extractor.create_test_script(code_blocks, self._tmpdir)
            run_command(self.launch_args + [str(test_script)])
            
        except SubprocessCallException as e:
-            pytest.fail(str(e))
+            pytest.fail(f"\nError while testing {doc_path.name}:\n{str(e)}")
        except Exception as e:
-            pytest.fail(f"Error testing {doc_path.name}: {str(e)}")
+            pytest.fail(f"\nUnexpected error while testing {doc_path.name}:\n{traceback.format_exc()}")

    @pytest.fixture(autouse=True)
    def _setup(self):
@ -136,11 +145,11 @@ def pytest_generate_tests(metafunc):
    """Generate test cases for each markdown file."""
    if "doc_path" in metafunc.fixturenames:
        test_class = metafunc.cls
-        
+
        # Initialize the class if needed
        if not hasattr(test_class, "md_files"):
            test_class.setup_class()
-            
+
        # Parameterize with the markdown files
        metafunc.parametrize(
            "doc_path",