[metadata]
name = crfm-helm
version = 0.5.3
author = Stanford CRFM
author_email = contact-crfm@stanford.edu
description = Benchmark for language models
long_description = file: README.md, docs/tutorial.md
long_description_content_type = text/markdown
keywords = language models benchmarking
license = Apache License 2.0
classifiers =
    Programming Language :: Python :: 3
    Programming Language :: Python :: 3 :: Only
    Programming Language :: Python :: 3.9
    Programming Language :: Python :: 3.10
    Programming Language :: Python :: 3.11
    License :: OSI Approved :: Apache Software License
url = https://github.com/stanford-crfm/helm

[options]
python_requires = >=3.9,<3.12
package_dir =
    =src
packages = find:
zip_safe = False
include_package_data = True

install_requires =
    # Common
    cattrs~=22.2
    dacite~=1.6
    importlib-resources~=5.10
    Mako~=1.2
    numpy~=1.23
    pyhocon~=0.3.59
    retrying~=1.3
    spacy~=3.5
    tqdm~=4.64
    zstandard~=0.18.0
    # sqlitedict==2.0.0 is slow! https://github.com/RaRe-Technologies/sqlitedict/issues/152
    # Keep sqlitedict version at 1.7.0.
    sqlitedict~=1.7
    bottle~=0.12.23

    # Basic Scenarios
    datasets~=2.17
    pyarrow>=11.0.0  # Pinned transitive dependency for datasets; workaround for #1026
    pyarrow-hotfix~=0.6  # Hotfix for CVE-2023-47248

    # Basic metrics
    nltk~=3.7,<3.8.2  # See https://github.com/stanford-crfm/helm/issues/2926
    rouge-score~=0.1.2
    scipy~=1.10
    uncertainty-calibration~=0.1.4
    scikit-learn~=1.1

    # Models and Metrics Extras
    transformers~=4.40  # For anthropic_client, vision_language.huggingface_vlm_client, huggingface_client, huggingface_tokenizer, test_openai_token_cost_estimator, model_summac (via summarization_metrics)
    # TODO: Upgrade torch - we need > 2.0.0 for newer versions of transformers
    torch>=1.13.1,<3.0.0  # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics)
    torchvision>=0.14.1,<3.0.0  # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics)

[options.extras_require]
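# A representative install of the extras defined below, using pip's standard
# extras syntax (any extra name in this section can be substituted):
#     pip install "crfm-helm[openai]"
# or, for an editable install from a source checkout:
#     pip install -e ".[openai,metrics]"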
proxy-server =
    gunicorn~=20.1.0

human-evaluation =
    scaleapi~=2.13.0
    surge-api~=1.1.0

scenarios =
    gdown~=5.1  # For disinformation_scenario, med_mcqa_scenario, med_qa_scenario: used by ensure_file_downloaded()
    sympy~=1.11.1  # For numeracy_scenario
    xlrd~=2.0.1  # For ice_scenario: used by pandas.read_excel()

metrics =
    google-api-python-client~=2.64  # For perspective_api_client via toxicity_metrics
    numba~=0.56  # For copyright_metrics
    pytrec_eval==0.5  # For ranking_metrics
    sacrebleu~=2.2.1  # For disinformation_metrics, machine_translation_metrics

summarization =
    summ-eval~=0.892  # For summarization_metrics

plots =
    colorcet~=3.0.1
    matplotlib~=3.6.0
    seaborn~=0.11.0

decodingtrust =
    fairlearn~=0.9.0

slurm =
    simple-slurm~=0.2.6

cleva =
    unidecode==1.3.6
    pypinyin==0.49.0
    jieba==0.42.1
    opencc==1.1.6
    langdetect==1.0.9

images =
    crfm-helm[accelerate]
    pillow~=10.2

mongo =
    pymongo~=4.2

unitxt =
    evaluate~=0.4.1

bhasa =
    pythainlp==5.0.0
    pyonmttok==1.37.0
    sacrebleu~=2.2.1

# Model extras
accelerate =
    accelerate~=0.25

aleph-alpha =
    aleph-alpha-client~=2.14.0
    tokenizers>=0.13.3

openvino =
    optimum[openvino]~=1.19

allenai =
    ai2-olmo~=0.2

amazon =
    boto3~=1.28.57
    awscli~=1.29.57
    botocore~=1.31.57

anthropic =
    anthropic~=0.17
    websocket-client~=1.3.2  # For legacy stanford-online-all-v4-s3

cohere =
    cohere~=5.3

mistral =
    mistralai~=0.0.11

openai =
    openai~=1.0
    tiktoken~=0.7
    pydantic~=2.0  # For model_dump(mode="json") - openai only requires pydantic>=1.9.0

google =
    google-cloud-aiplatform~=1.48

together =
    together~=1.1

yandex =
    sentencepiece~=0.1.97

models =
    crfm-helm[ai21]
    crfm-helm[accelerate]
    crfm-helm[aleph-alpha]
    crfm-helm[allenai]
    crfm-helm[amazon]
    crfm-helm[anthropic]
    crfm-helm[cohere]
    crfm-helm[google]
    crfm-helm[mistral]
    crfm-helm[openai]
    crfm-helm[reka]
    crfm-helm[together]
    crfm-helm[yandex]
    crfm-helm[openvino]

reka =
    reka-api~=2.0.0

vlm =
    crfm-helm[openai]

    # For OpenFlamingo
    einops~=0.7.0
    einops-exts~=0.0.4
    open-clip-torch~=2.24

    # For IDEFICS
    torch~=2.1

    # For Qwen: https://github.com/QwenLM/Qwen-VL/blob/master/requirements.txt
    transformers_stream_generator~=0.0.4
    scipy~=1.10
    torchvision>=0.14.1,<3.0.0

    # For Reka AI
    crfm-helm[reka]

    # VLM scenarios
    crfm-helm[images]
    crfm-helm[image2struct]

    # For metrics
    pycocoevalcap~=1.2

image2struct =
    crfm-helm[images]

    # Latex
    # You will need to install LaTeX separately.
    # You can run `sudo apt-get install texlive-full` on Ubuntu.
    latex~=0.7.0
    pdf2image~=1.16.3

    # Webpage
    # You will need to install Jekyll separately.
    selenium~=4.17.2
    html2text~=2024.2.26

    # Metrics
    opencv-python~=4.7.0.68
    lpips~=0.1.4
    imagehash~=4.3.1  # For caching

heim =
    # HEIM scenarios
    gdown~=5.1

    # HEIM models
    diffusers~=0.24.0
    icetk~=0.0.4
    jax~=0.4.13
    jaxlib~=0.4.13
    crfm-helm[openai]

    # For model, kakaobrain/mindall-e
    einops~=0.7.0
    omegaconf~=2.3.0
    pytorch-lightning~=2.0.5

    # For model, craiyon/dalle-mini and craiyon/dalle-mega
    flax~=0.6.11
    ftfy~=6.1.1
    Unidecode~=1.3.6
    wandb~=0.13.11

    # HEIM perturbations
    google-cloud-translate~=3.11.2

    # HEIM metrics
    autokeras~=1.0.20
    clip-anytorch~=2.5.0
    google-cloud-storage~=2.9
    lpips~=0.1.4
    multilingual-clip~=1.0.10
    NudeNet~=2.0.9
    opencv-python~=4.7.0.68
    pytorch-fid~=0.3.0
    tensorflow~=2.11
    timm~=0.6.12
    torch-fidelity~=0.3.0
    torchmetrics~=0.11.1

    # Transitive dependency of NudeNet
    # This needs to be a version that provides wheels for all Python versions
    # supported by crfm-helm, i.e. Python 3.9, 3.10, and 3.11.
    # Disallow version 0.23.* because it has no Python 3.9 wheels.
    scikit-image>=0.22,==0.*,!=0.23.*

    # Shared image dependencies
    crfm-helm[images]

# Install everything
all =
    crfm-helm[proxy-server]
    crfm-helm[human-evaluation]
    crfm-helm[scenarios]
    crfm-helm[metrics]
    crfm-helm[plots]
    crfm-helm[decodingtrust]
    crfm-helm[slurm]
    crfm-helm[cleva]
    crfm-helm[images]
    crfm-helm[models]
    crfm-helm[mongo]
    crfm-helm[heim]
    crfm-helm[vlm]
    crfm-helm[bhasa]
    # crfm-helm[dev] is excluded because end-users don't need it.
    # crfm-helm[summarization] is excluded because it requires torch<2.0.
    # TODO(#2280): Add crfm-helm[summarization] back.

# Development only
# Do not include in all
dev =
    pytest~=7.2.0
    pre-commit~=2.20.0
    # Errors produced by type checkers and linters are very version-specific,
    # so they are pinned to an exact version.
    black==24.3.0
    mypy==1.5.1
    flake8==5.0.4

[options.entry_points]
console_scripts =
    helm-run = helm.benchmark.run:main
    helm-summarize = helm.benchmark.presentation.summarize:main
    helm-server = helm.benchmark.server:main
    helm-create-plots = helm.benchmark.presentation.create_plots:main
    crfm-proxy-server = helm.proxy.server:main
    crfm-proxy-cli = helm.proxy.cli:main
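# Illustrative workflow using the console scripts above. The run entry, suite
# name, and flag values are examples only, not defaults; see the HELM
# documentation for the full CLI reference:
#     helm-run --run-entries mmlu:subject=philosophy,model=openai/gpt2 --suite my-suite --max-eval-instances 10
#     helm-summarize --suite my-suite
#     helm-server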
[options.packages.find]
where = src
exclude =
    tests*

# Settings for Flake8: Tool For Style Guide Enforcement
[flake8]
max-line-length = 120
exclude =
    venv/*
    src/helm/clients/image_generation/dalle_mini/*
    src/helm/clients/image_generation/mindalle/*
    src/helm/clients/vision_language/open_flamingo/*

# Ignore completely:
# E203 - whitespace before ':' (conflicts with black)
# E231 - missing whitespace after ',', ';', or ':'
# E731 - do not assign a lambda expression, use a def
# W503 - line break before binary operator (conflicts with black)
# W605 - invalid escape sequence '\' (causes failures)
ignore = E203,E231,E731,W503,W605

# Settings for Mypy: static type checker for Python 3
[mypy]
ignore_missing_imports = True
check_untyped_defs = True
# TODO: Remove disable_error_code
disable_error_code = annotation-unchecked
# TODO: Change disallow_untyped_defs to True
disallow_untyped_defs = False
exclude = dalle_mini|mindalle|open_flamingo

[tool:pytest]
addopts =
    # By default:
    # - we don't test models because doing so will
    #   make real requests and spend real money
    # - we don't test scenarios because these will
    #   download files, which is slow, consumes disk
    #   space, and increases the chance of spurious
    #   test failures due to failed downloads.
    #
    # For more documentation on pytest markers, see:
    # - https://docs.pytest.org/en/latest/how-to/mark.html#mark
    # - https://docs.pytest.org/en/latest/example/markers.html#mark-examples
    -m 'not models and not scenarios'
markers =
    # Marker for model tests that make real model requests
    models
    # Marker for scenario tests that download files
    scenarios
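# A -m expression given on the command line overrides the one in addopts
# (the last -m wins), so the excluded tests can be opted into explicitly:
#     pytest -m models      # only tests that make real (paid) model requests
#     pytest -m scenarios   # only tests that download scenario files
#     pytest -m ''          # empty expression: run everything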