+ The page you are looking for might have been removed, had its name + changed, or is temporarily unavailable. +
+diff --git a/.editorconfig b/.editorconfig new file mode 100755 index 0000000..a257bcd --- /dev/null +++ b/.editorconfig @@ -0,0 +1,14 @@ +; https://editorconfig.org + +root = true + +[*] +charset = utf-8 +end_of_line = lf +indent_size = 2 +indent_style = space +trim_trailing_whitespace = true +insert_final_newline = true + +[*.md] +trim_trailing_whitespace = false diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..dfb1456 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,38 @@ +stages: + - build + +variables: + HUGO_ENV: production + HUGO_VERSION: "0.115.4" + GO_VERSION: "1.20.5" + NODE_VERSION: "18.16.1" + +cache: + paths: + - node_modules/ + +default: + image: node:${NODE_VERSION} + before_script: + - echo "USING NODE ${NODE_VERSION}" + - apt-get update && apt-get install -y curl + - curl -LO "https://github.com/gohugoio/hugo/releases/download/v${HUGO_VERSION}/hugo_extended_${HUGO_VERSION}_Linux-64bit.tar.gz" + - tar -xvf hugo_extended_${HUGO_VERSION}_Linux-64bit.tar.gz + - mv hugo /usr/local/bin/ + - rm hugo_extended_${HUGO_VERSION}_Linux-64bit.tar.gz + - echo "HUGO ${HUGO_VERSION} INSTALLED" + - curl -LO "https://dl.google.com/go/go${GO_VERSION}.linux-amd64.tar.gz" + - tar -C /usr/local -xzf go${GO_VERSION}.linux-amd64.tar.gz + - export PATH=$PATH:/usr/local/go/bin + - rm go${GO_VERSION}.linux-amd64.tar.gz + - echo "GO ${GO_VERSION} INSTALLED" + - npm install + +pages: + stage: build + script: + - npm run project-setup + - npm run build + artifacts: + paths: + - public diff --git a/.jshintrc b/.jshintrc new file mode 100644 index 0000000..873eef9 --- /dev/null +++ b/.jshintrc @@ -0,0 +1,59 @@ +{ + "maxerr": 50, + "bitwise": true, + "camelcase": false, + "curly": true, + "eqeqeq": true, + "forin": true, + "freeze": true, + "immed": true, + "indent": 2, + "latedef": true, + "newcap": false, + "noarg": true, + "noempty": true, + "nonbsp": true, + "nonew": true, + "plusplus": false, + "undef": true, + "unused": false, + "strict": true, + 
"maxparams": false, + "maxdepth": 4, + "maxstatements": false, + "maxcomplexity": false, + "maxlen": 400, + "browser": true, + "devel": true, + "asi": false, + "boss": false, + "debug": false, + "eqnull": false, + "es3": false, + "es5": false, + "esversion": 12, + "moz": false, + "evil": true, + "expr": true, + "funcscope": false, + "globalstrict": false, + "iterator": false, + "lastsemic": false, + "laxbreak": false, + "laxcomma": false, + "loopfunc": true, + "multistr": true, + "noyield": false, + "notypeof": false, + "proto": false, + "scripturl": false, + "shadow": false, + "sub": false, + "supernew": false, + "validthis": false, + "globals": { + "jQuery": false, + "google": false, + "$": false + } +} diff --git a/.markdownlint.json b/.markdownlint.json new file mode 100755 index 0000000..7d41e3f --- /dev/null +++ b/.markdownlint.json @@ -0,0 +1,5 @@ +{ + "MD033": false, + "MD034": false, + "MD013": false +} diff --git a/.prettierrc b/.prettierrc new file mode 100755 index 0000000..ee30508 --- /dev/null +++ b/.prettierrc @@ -0,0 +1,13 @@ +{ + "plugins": ["prettier-plugin-go-template"], + "overrides": [ + { + "files": ["*.html"], + "options": { + "parser": "go-template", + "goTemplateBracketSpacing": true, + "bracketSameLine": true + } + } + ] +} diff --git a/app/Dockerfile b/app/Dockerfile index 6bef32a..0becc00 100644 --- a/app/Dockerfile +++ b/app/Dockerfile @@ -1,24 +1,63 @@ -FROM python:3.8.18 +# Stage 1: Build stage +FROM node:20.12.2 as builder -# Uncomment the following lines to make PyTorch available to your application. -# See https://skiff.allenai.org/gpu.html for more details. 
-# -# ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 -# ENV NVIDIA_VISIBLE_DEVICES all -# ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -# RUN pip install torch==1.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html +# Set environment variables +ENV HUGO_ENV production +ENV HUGO_VERSION 0.115.4 +ENV GO_VERSION 1.22.2 +ENV BASE_URL http://localhost +ENV BASE_PORT 8000 + +# Install Hugo +RUN apt-get update && apt-get install -y curl \ + && curl -LO "https://github.com/gohugoio/hugo/releases/download/v${HUGO_VERSION}/hugo_extended_${HUGO_VERSION}_Linux-64bit.tar.gz" \ + && tar -xvf hugo_extended_${HUGO_VERSION}_Linux-64bit.tar.gz \ + && mv hugo /usr/local/bin/ \ + && rm hugo_extended_${HUGO_VERSION}_Linux-64bit.tar.gz \ + && echo "Hugo ${HUGO_VERSION} installed" + +# Install Go +RUN curl -LO "https://dl.google.com/go/go${GO_VERSION}.linux-amd64.tar.gz" \ + && tar -C /usr/local -xzf go${GO_VERSION}.linux-amd64.tar.gz \ + && rm go${GO_VERSION}.linux-amd64.tar.gz \ + && echo "export PATH=$PATH:/usr/local/go/bin" >> /etc/profile \ + && . /etc/profile \ + && echo "Go ${GO_VERSION} installed" + +# Export the PATH variable +RUN export PATH=$PATH:/usr/local/go/bin + +# Set working directory WORKDIR /app -# Install Python dependencies -COPY requirements.txt . -RUN pip install -r requirements.txt +# Copy package.json and package-lock.json +COPY package*.json ./ + +# Install project dependencies +RUN npm install + +# Copy project files +COPY . . + +# Run project setup and build +RUN PATH=$PATH:/usr/local/go/bin hugo --gc --minify --templateMetrics --templateMetricsHints --forceSyncStatic -e production --minify + +# --baseURL=$BASE_URL:$BASE_PORT + + +# Stage 2: Serve stage +FROM nginx:alpine +# FROM tiangolo/uwsgi-nginx-flask:python3.11 + +# Copy built site from builder stage +COPY --from=builder /app/public /usr/share/nginx/html +COPY --from=builder /app/nginx.conf /etc/nginx/conf.d/default.conf -# Copy over the source code -COPY . 
./ +# Expose port 80 +EXPOSE 8000 -ENV PYTHONPATH=$PYTHONPATH':app/' +# Command to start nginx +CMD ["nginx", "-g", "daemon off;"] -# Kick things off -ENTRYPOINT [ "streamlit" ] -CMD [ "run", "./app.py" ] +# CMD ["python3", "-m", "http.server", "-d", "/usr/share/nginx/html", "8000"] diff --git a/app/LICENSE b/app/LICENSE new file mode 100644 index 0000000..30343a1 --- /dev/null +++ b/app/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2023 - Present, Zeon Studio + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/app/amplify.yml b/app/amplify.yml new file mode 100644 index 0000000..d6498c9 --- /dev/null +++ b/app/amplify.yml @@ -0,0 +1,29 @@ +version: 1 +frontend: + phases: + preBuild: + commands: + - yum install -y curl + - curl -LO "https://github.com/gohugoio/hugo/releases/download/v0.115.4/hugo_extended_0.115.4_Linux-64bit.tar.gz" + - tar -xvf hugo_extended_0.115.4_Linux-64bit.tar.gz + - mv hugo /usr/local/bin/ + - rm hugo_extended_0.115.4_Linux-64bit.tar.gz + - echo "HUGO 0.115.4 INSTALLED" + - curl -LO "https://dl.google.com/go/go1.20.5.linux-amd64.tar.gz" + - tar -C /usr/local -xzf go1.20.5.linux-amd64.tar.gz + - export PATH=$PATH:/usr/local/go/bin + - rm go1.20.5.linux-amd64.tar.gz + - echo "GO 1.20.5 INSTALLED" + - npm install + build: + commands: + - npm run project-setup + - npm run build + artifacts: + # IMPORTANT - Please verify your build output directory + baseDirectory: /public + files: + - "**/*" + cache: + paths: + - node_modules/**/* diff --git a/app/app.py b/app/app.py deleted file mode 100644 index ae7bd32..0000000 --- a/app/app.py +++ /dev/null @@ -1,232 +0,0 @@ -import datetime as dt -import itertools - -import pandas as pd -import streamlit as st -from PIL import Image - -from src.api.api import filter_resources -from src.components.goat_counter import add_goat_counter_tracker -from src.constants import BASE_DIR, ORDERED_SECTION_HEADERS -from src.theme import theme -from src.utils import create_markdown_img, load_data, load_logos - -pd.options.display.html.border = 0 - - -def write_resource(row, logos: dict) -> None: - col1, col2, col3, col4 = st.columns([0.4, 1, 5, 1], gap="small") - - modality_icons = [] - for mod_img, modality in [ - (logos["text"], "Text"), - (logos["vision"], "Vision"), - (logos["speech"], "Speech"), - ]: - mod_icon = ( - create_markdown_img(mod_img, None, 20) - if modality in row["Modalities"] - else " " - ) - modality_icons.append(mod_icon) - col1.markdown(" ".join(modality_icons), unsafe_allow_html=True) - - 
col2.write(row["Name"]) - col3.write(row["Description"]) - - logo_links = [] - for logo_img, col in [ - (logos["arxiv"], "Paper Link"), - (logos["hf"], "HuggingFace Link"), - (logos["github"], "GitHub Link"), - (logos["web"], "Website Link"), - ]: - logo_link = ( - create_markdown_img(logo_img, row[col], dim=20) if row[col] else " " - ) # "
" - logo_links.append(logo_link) - # col4.markdown(logo_link, unsafe_allow_html=True) - col4.markdown(" ".join(logo_links), unsafe_allow_html=True) - - -def streamlit_app(): - st.set_page_config( - page_title="Foundation Model Development Cheatsheet", layout="wide" - ) # , initial_sidebar_state='collapsed') - - RESOURCES = load_data() - LOGOS = load_logos() - - # add analytics tracking - add_goat_counter_tracker() - # add custom AI2 branded CSS theme and header banner - theme.add_theme() - - st.markdown( - "" - + create_markdown_img(LOGOS["cheatsheet"], "/", 220) - + "
", - unsafe_allow_html=True, - ) - - # st.title("Foundation Model Development Cheatsheet") - st.markdown( - "Assembled by open model developers from AI2, EleutherAI, Google, Hugging Face, Masakhane, MIT, MLCommons, Princeton, Stanford CRFM, University of California Santa Barbara (UCSB), Univesity College London (UCL) and University of Washington (UW).
", - unsafe_allow_html=True, - ) - st.image("resources/orgs.png", use_column_width=True) - st.markdown("Modalities:
', - unsafe_allow_html=True, - ) - - # col1, col2, col3 = st.columns([1, 2, 1], gap="medium") - # st.markdown("Modality Types:") - checkbox_text = st.checkbox("Text", value=True) - checkbox_vision = st.checkbox("Vision") - checkbox_speech = st.checkbox("Speech") - - date_format = "MMM, YYYY" # format output - start_date = dt.date(year=2000, month=1, day=1) - end_date = dt.datetime.now().date() - # max_days = end_date - start_date - - time_selection = st.slider( - label="Start Date:", - min_value=start_date, - value=start_date, - max_value=end_date, - format=date_format, - ) - - st.divider() - - # Every form must have a submit button. - submitted = st.form_submit_button("Submit Selection") - - #### FILTER MENU ENDS HERE - - # legend: - st.divider() - st.markdown("**Legend**") - col0, col1, col2, col3 = st.columns([1, 1, 1, 1], gap="small") - text_img = create_markdown_img(LOGOS["text"], None, 20) - vision_img = create_markdown_img(LOGOS["vision"], None, 20) - speech_img = create_markdown_img(LOGOS["speech"], None, 20) - arxiv_img = create_markdown_img(LOGOS["arxiv"], None, 20) - hf_img = create_markdown_img(LOGOS["hf"], None, 20) - github_img = create_markdown_img(LOGOS["github"], None, 20) - web_img = create_markdown_img(LOGOS["web"], None, 20) - - col1.markdown(text_img + " = Text Modality", unsafe_allow_html=True) - col1.markdown(vision_img + " = Vision Modality", unsafe_allow_html=True) - col1.markdown(speech_img + " = Speech Modality", unsafe_allow_html=True) - - col2.markdown(arxiv_img + " = Paper Link", unsafe_allow_html=True) - col2.markdown(hf_img + " = HuggingFace Link", unsafe_allow_html=True) - col2.markdown(github_img + " = GitHub Link", unsafe_allow_html=True) - col2.markdown(web_img + " = Web Link", unsafe_allow_html=True) - st.divider() - - if submitted: - for category in [s for s in ORDERED_SECTION_HEADERS if s in category_select]: - - filtered_resources = filter_resources( - RESOURCES, - # sections=category_select, - sections=[category], - 
text_mod=checkbox_text, - vision_mod=checkbox_vision, - speech_mod=checkbox_speech, - time_range=time_selection, - ) - - html_table = filtered_resources.to_html( - columns=["Modality", "Name", "Description", "Links"], - index=False, - header=False, - escape=False, - border=0, - ) - st.header(category) - st.write(ORDERED_SECTION_HEADERS[category]) - st.write(html_table, unsafe_allow_html=True) - st.divider() - - # Please don't edit or remove the content of this footer as we'd like to include these important - # links on all AI2 applications - theme.add_footer() - - -if __name__ == "__main__": - streamlit_app() diff --git a/app/assets/icons.py b/app/assets/icons.py new file mode 100644 index 0000000..5d1ab2f --- /dev/null +++ b/app/assets/icons.py @@ -0,0 +1,79 @@ +import os +import requests +import json +from urllib.parse import urlparse + +def get_github_profile_logo(github_link, save_dir, github_token): + if github_link == "": + return None + + # Extracting username and repository from GitHub link + parsed_url = urlparse(github_link) + path_parts = parsed_url.path.strip("/").split("/") + if len(path_parts) < 1: + print(f"Invalid GitHub link: {github_link}") + return None + username = path_parts[0] + + # Making a request to GitHub API to get user information + api_url = f"https://api.github.com/users/{username}" + headers = { + 'Authorization': f'token {github_token}', + 'User-Agent': 'Your-User-Agent' # Replace 'Your-User-Agent' with your actual user agent + } + response = requests.get(api_url, headers=headers) + + if response.status_code == 200: + user_info = response.json() + profile_logo_url = user_info['avatar_url'] + + # Downloading the profile logo and saving it to the specified directory + logo_filename = f"{username}_logo.png" + logo_filepath = os.path.join(save_dir, logo_filename) + with open(logo_filepath, 'wb') as f: + f.write(requests.get(profile_logo_url).content) + + return logo_filepath + else: + print(f"Failed to fetch profile logo for 
{github_link}") + print(f"GitHub API response: {response.status_code} - {response.reason}") + return None + + +def update_resources_json(resources, github_profile_logos, resources_file): + for i, resource in enumerate(resources): + if i < len(github_profile_logos): # Check index to avoid out of bounds + logo_path = github_profile_logos[i] + resource['logo'] = os.path.basename(logo_path) + else: + resource['logo'] = "" # Or provide a default placeholder + + with open(resources_file, 'w') as f: + json.dump(resources, f, indent=2) + +def main(): + resources_file = '/home/iguana/WebstormProjects/fm-cheatsheet/assets/resources.json' + save_dir = '/home/iguana/WebstormProjects/fm-cheatsheet/assets/images/gh-icons' + github_token = '' # Replace 'Your-GitHub-Token' with your actual GitHub token + + if not os.path.exists(save_dir): + os.makedirs(save_dir) + + with open(resources_file) as f: + resources = json.load(f) + + for resource in resources: + github_link = resource.get('github_link') + if github_link: # Only process resources with a GitHub link + profile_logo_path = get_github_profile_logo(github_link, save_dir, github_token) + if profile_logo_path: + resource['logo'] = os.path.basename(profile_logo_path) + # Update resources.json immediately + with open(resources_file, 'w') as f: + json.dump(resources, f, indent=2) + + print("GitHub profile logos saved to:", save_dir) # All logos saved + print("resources.json updated with logo property") # resources.json is already updated + +if __name__ == "__main__": + main() diff --git a/app/assets/images/aimodelsorg-replace.png b/app/assets/images/aimodelsorg-replace.png new file mode 100644 index 0000000..2c2963c Binary files /dev/null and b/app/assets/images/aimodelsorg-replace.png differ diff --git a/app/assets/images/cached-icons/AI4Bharat_logo.png b/app/assets/images/cached-icons/AI4Bharat_logo.png new file mode 100644 index 0000000..0cf6d10 Binary files /dev/null and b/app/assets/images/cached-icons/AI4Bharat_logo.png 
differ diff --git a/app/assets/images/cached-icons/ARBML_logo.png b/app/assets/images/cached-icons/ARBML_logo.png new file mode 100644 index 0000000..2585d35 Binary files /dev/null and b/app/assets/images/cached-icons/ARBML_logo.png differ diff --git a/app/assets/images/cached-icons/BradyFU_logo.png b/app/assets/images/cached-icons/BradyFU_logo.png new file mode 100644 index 0000000..6114667 Binary files /dev/null and b/app/assets/images/cached-icons/BradyFU_logo.png differ diff --git a/app/assets/images/cached-icons/Breakend_logo.png b/app/assets/images/cached-icons/Breakend_logo.png new file mode 100644 index 0000000..f54863b Binary files /dev/null and b/app/assets/images/cached-icons/Breakend_logo.png differ diff --git a/app/assets/images/cached-icons/CarperAI_logo.png b/app/assets/images/cached-icons/CarperAI_logo.png new file mode 100644 index 0000000..fdfd30c Binary files /dev/null and b/app/assets/images/cached-icons/CarperAI_logo.png differ diff --git a/app/assets/images/cached-icons/DAI-Lab_logo.png b/app/assets/images/cached-icons/DAI-Lab_logo.png new file mode 100644 index 0000000..5de3f07 Binary files /dev/null and b/app/assets/images/cached-icons/DAI-Lab_logo.png differ diff --git a/app/assets/images/cached-icons/Data-Provenance-Initiative_logo.png b/app/assets/images/cached-icons/Data-Provenance-Initiative_logo.png new file mode 100644 index 0000000..0a83875 Binary files /dev/null and b/app/assets/images/cached-icons/Data-Provenance-Initiative_logo.png differ diff --git a/app/assets/images/cached-icons/EleutherAI_logo.png b/app/assets/images/cached-icons/EleutherAI_logo.png new file mode 100644 index 0000000..0a54216 Binary files /dev/null and b/app/assets/images/cached-icons/EleutherAI_logo.png differ diff --git a/app/assets/images/cached-icons/IndoNLP_logo.png b/app/assets/images/cached-icons/IndoNLP_logo.png new file mode 100644 index 0000000..2bc6c2e Binary files /dev/null and b/app/assets/images/cached-icons/IndoNLP_logo.png differ diff --git 
a/app/assets/images/cached-icons/LAION-AI_logo.png b/app/assets/images/cached-icons/LAION-AI_logo.png new file mode 100644 index 0000000..5506947 Binary files /dev/null and b/app/assets/images/cached-icons/LAION-AI_logo.png differ diff --git a/app/assets/images/cached-icons/Luodian_logo.png b/app/assets/images/cached-icons/Luodian_logo.png new file mode 100644 index 0000000..b0aa215 Binary files /dev/null and b/app/assets/images/cached-icons/Luodian_logo.png differ diff --git a/app/assets/images/cached-icons/MMMU-Benchmark_logo.png b/app/assets/images/cached-icons/MMMU-Benchmark_logo.png new file mode 100644 index 0000000..05b5db2 Binary files /dev/null and b/app/assets/images/cached-icons/MMMU-Benchmark_logo.png differ diff --git a/app/assets/images/cached-icons/Mimino666_logo.png b/app/assets/images/cached-icons/Mimino666_logo.png new file mode 100644 index 0000000..b19d84a Binary files /dev/null and b/app/assets/images/cached-icons/Mimino666_logo.png differ diff --git a/app/assets/images/cached-icons/NVIDIA_logo.png b/app/assets/images/cached-icons/NVIDIA_logo.png new file mode 100644 index 0000000..36594d2 Binary files /dev/null and b/app/assets/images/cached-icons/NVIDIA_logo.png differ diff --git a/app/assets/images/cached-icons/OpenAccess-AI-Collective_logo.png b/app/assets/images/cached-icons/OpenAccess-AI-Collective_logo.png new file mode 100644 index 0000000..acc9021 Binary files /dev/null and b/app/assets/images/cached-icons/OpenAccess-AI-Collective_logo.png differ diff --git a/app/assets/images/cached-icons/OpenGVLab_logo.png b/app/assets/images/cached-icons/OpenGVLab_logo.png new file mode 100644 index 0000000..f70dc36 Binary files /dev/null and b/app/assets/images/cached-icons/OpenGVLab_logo.png differ diff --git a/app/assets/images/cached-icons/PAIR-code_logo.png b/app/assets/images/cached-icons/PAIR-code_logo.png new file mode 100644 index 0000000..84b2213 Binary files /dev/null and b/app/assets/images/cached-icons/PAIR-code_logo.png differ diff 
--git a/app/assets/images/cached-icons/Ren-Research_logo.png b/app/assets/images/cached-icons/Ren-Research_logo.png new file mode 100644 index 0000000..c397f61 Binary files /dev/null and b/app/assets/images/cached-icons/Ren-Research_logo.png differ diff --git a/app/assets/images/cached-icons/SEACrowd_logo.png b/app/assets/images/cached-icons/SEACrowd_logo.png new file mode 100644 index 0000000..b7217e8 Binary files /dev/null and b/app/assets/images/cached-icons/SEACrowd_logo.png differ diff --git a/app/assets/images/cached-icons/SpeechColab_logo.png b/app/assets/images/cached-icons/SpeechColab_logo.png new file mode 100644 index 0000000..44d32ee Binary files /dev/null and b/app/assets/images/cached-icons/SpeechColab_logo.png differ diff --git a/app/assets/images/cached-icons/Stability-AI_logo.png b/app/assets/images/cached-icons/Stability-AI_logo.png new file mode 100644 index 0000000..26e316d Binary files /dev/null and b/app/assets/images/cached-icons/Stability-AI_logo.png differ diff --git a/app/assets/images/cached-icons/Vision-CAIR_logo.png b/app/assets/images/cached-icons/Vision-CAIR_logo.png new file mode 100644 index 0000000..2ba4500 Binary files /dev/null and b/app/assets/images/cached-icons/Vision-CAIR_logo.png differ diff --git a/app/assets/images/cached-icons/Yuchen413_logo.png b/app/assets/images/cached-icons/Yuchen413_logo.png new file mode 100644 index 0000000..ab93c4e Binary files /dev/null and b/app/assets/images/cached-icons/Yuchen413_logo.png differ diff --git a/app/assets/images/cached-icons/allenai_logo.png b/app/assets/images/cached-icons/allenai_logo.png new file mode 100644 index 0000000..42faed2 Binary files /dev/null and b/app/assets/images/cached-icons/allenai_logo.png differ diff --git a/app/assets/images/cached-icons/alon-albalak_logo.png b/app/assets/images/cached-icons/alon-albalak_logo.png new file mode 100644 index 0000000..56ff1a3 Binary files /dev/null and b/app/assets/images/cached-icons/alon-albalak_logo.png differ diff --git 
a/app/assets/images/cached-icons/artidoro_logo.png b/app/assets/images/cached-icons/artidoro_logo.png new file mode 100644 index 0000000..a96e4ce Binary files /dev/null and b/app/assets/images/cached-icons/artidoro_logo.png differ diff --git a/app/assets/images/cached-icons/bertiev_logo.png b/app/assets/images/cached-icons/bertiev_logo.png new file mode 100644 index 0000000..bfea869 Binary files /dev/null and b/app/assets/images/cached-icons/bertiev_logo.png differ diff --git a/app/assets/images/cached-icons/bigcode-project_logo.png b/app/assets/images/cached-icons/bigcode-project_logo.png new file mode 100644 index 0000000..48729ba Binary files /dev/null and b/app/assets/images/cached-icons/bigcode-project_logo.png differ diff --git a/app/assets/images/cached-icons/bigscience-workshop_logo.png b/app/assets/images/cached-icons/bigscience-workshop_logo.png new file mode 100644 index 0000000..109bb27 Binary files /dev/null and b/app/assets/images/cached-icons/bigscience-workshop_logo.png differ diff --git a/app/assets/images/cached-icons/cisnlp_logo.png b/app/assets/images/cached-icons/cisnlp_logo.png new file mode 100644 index 0000000..8a7e4d4 Binary files /dev/null and b/app/assets/images/cached-icons/cisnlp_logo.png differ diff --git a/app/assets/images/cached-icons/dadelani_logo.png b/app/assets/images/cached-icons/dadelani_logo.png new file mode 100644 index 0000000..b72e0b5 Binary files /dev/null and b/app/assets/images/cached-icons/dadelani_logo.png differ diff --git a/app/assets/images/cached-icons/datasets_logo.png b/app/assets/images/cached-icons/datasets_logo.png new file mode 100644 index 0000000..ad5eef8 Binary files /dev/null and b/app/assets/images/cached-icons/datasets_logo.png differ diff --git a/app/assets/images/cached-icons/embeddings-benchmark_logo.png b/app/assets/images/cached-icons/embeddings-benchmark_logo.png new file mode 100644 index 0000000..096888f Binary files /dev/null and b/app/assets/images/cached-icons/embeddings-benchmark_logo.png 
differ diff --git a/app/assets/images/cached-icons/facebookresearch_logo.png b/app/assets/images/cached-icons/facebookresearch_logo.png new file mode 100644 index 0000000..53934ad Binary files /dev/null and b/app/assets/images/cached-icons/facebookresearch_logo.png differ diff --git a/app/assets/images/cached-icons/github_logo.png b/app/assets/images/cached-icons/github_logo.png new file mode 100644 index 0000000..893151b Binary files /dev/null and b/app/assets/images/cached-icons/github_logo.png differ diff --git a/app/assets/images/cached-icons/google-research_logo.png b/app/assets/images/cached-icons/google-research_logo.png new file mode 100644 index 0000000..7aa70dc Binary files /dev/null and b/app/assets/images/cached-icons/google-research_logo.png differ diff --git a/app/assets/images/cached-icons/google_logo.png b/app/assets/images/cached-icons/google_logo.png new file mode 100644 index 0000000..394b500 Binary files /dev/null and b/app/assets/images/cached-icons/google_logo.png differ diff --git a/app/assets/images/cached-icons/haotian-liu_logo.png b/app/assets/images/cached-icons/haotian-liu_logo.png new file mode 100644 index 0000000..8968b58 Binary files /dev/null and b/app/assets/images/cached-icons/haotian-liu_logo.png differ diff --git a/app/resources/logos/hf.png b/app/assets/images/cached-icons/huggingface_logo.png similarity index 100% rename from app/resources/logos/hf.png rename to app/assets/images/cached-icons/huggingface_logo.png diff --git a/app/assets/images/cached-icons/jmschrei_logo.png b/app/assets/images/cached-icons/jmschrei_logo.png new file mode 100644 index 0000000..4100a5f Binary files /dev/null and b/app/assets/images/cached-icons/jmschrei_logo.png differ diff --git a/app/assets/images/cached-icons/karpathy_logo.png b/app/assets/images/cached-icons/karpathy_logo.png new file mode 100644 index 0000000..a4c5096 Binary files /dev/null and b/app/assets/images/cached-icons/karpathy_logo.png differ diff --git 
a/app/assets/images/cached-icons/keirp_logo.png b/app/assets/images/cached-icons/keirp_logo.png new file mode 100644 index 0000000..0df2aa9 Binary files /dev/null and b/app/assets/images/cached-icons/keirp_logo.png differ diff --git a/app/assets/images/cached-icons/kernelmachine_logo.png b/app/assets/images/cached-icons/kernelmachine_logo.png new file mode 100644 index 0000000..e9f66f3 Binary files /dev/null and b/app/assets/images/cached-icons/kernelmachine_logo.png differ diff --git a/app/assets/images/cached-icons/laurieburchell_logo.png b/app/assets/images/cached-icons/laurieburchell_logo.png new file mode 100644 index 0000000..e86e9dd Binary files /dev/null and b/app/assets/images/cached-icons/laurieburchell_logo.png differ diff --git a/app/assets/images/cached-icons/leondz_logo.png b/app/assets/images/cached-icons/leondz_logo.png new file mode 100644 index 0000000..c13c71f Binary files /dev/null and b/app/assets/images/cached-icons/leondz_logo.png differ diff --git a/app/assets/images/cached-icons/lfwa_logo.png b/app/assets/images/cached-icons/lfwa_logo.png new file mode 100644 index 0000000..44177c4 Binary files /dev/null and b/app/assets/images/cached-icons/lfwa_logo.png differ diff --git a/app/assets/images/cached-icons/lilacai_logo.png b/app/assets/images/cached-icons/lilacai_logo.png new file mode 100644 index 0000000..7ea9317 Binary files /dev/null and b/app/assets/images/cached-icons/lilacai_logo.png differ diff --git a/app/assets/images/cached-icons/lm-sys_logo.png b/app/assets/images/cached-icons/lm-sys_logo.png new file mode 100644 index 0000000..44b3871 Binary files /dev/null and b/app/assets/images/cached-icons/lm-sys_logo.png differ diff --git a/app/assets/images/cached-icons/m-bain_logo.png b/app/assets/images/cached-icons/m-bain_logo.png new file mode 100644 index 0000000..f335c32 Binary files /dev/null and b/app/assets/images/cached-icons/m-bain_logo.png differ diff --git a/app/assets/images/cached-icons/masakhane-io_logo.png 
b/app/assets/images/cached-icons/masakhane-io_logo.png new file mode 100644 index 0000000..63ab400 Binary files /dev/null and b/app/assets/images/cached-icons/masakhane-io_logo.png differ diff --git a/app/assets/images/cached-icons/microsoft_logo.png b/app/assets/images/cached-icons/microsoft_logo.png new file mode 100644 index 0000000..a6dc08f Binary files /dev/null and b/app/assets/images/cached-icons/microsoft_logo.png differ diff --git a/app/assets/images/cached-icons/mlco2_logo.png b/app/assets/images/cached-icons/mlco2_logo.png new file mode 100644 index 0000000..6652b86 Binary files /dev/null and b/app/assets/images/cached-icons/mlco2_logo.png differ diff --git a/app/assets/images/cached-icons/mlfoundations_logo.png b/app/assets/images/cached-icons/mlfoundations_logo.png new file mode 100644 index 0000000..a961bc5 Binary files /dev/null and b/app/assets/images/cached-icons/mlfoundations_logo.png differ diff --git a/app/assets/images/cached-icons/nayeon7lee_logo.png b/app/assets/images/cached-icons/nayeon7lee_logo.png new file mode 100644 index 0000000..5d0f4ce Binary files /dev/null and b/app/assets/images/cached-icons/nayeon7lee_logo.png differ diff --git a/app/assets/images/cached-icons/nyu-mll_logo.png b/app/assets/images/cached-icons/nyu-mll_logo.png new file mode 100644 index 0000000..d9b3679 Binary files /dev/null and b/app/assets/images/cached-icons/nyu-mll_logo.png differ diff --git a/app/assets/images/cached-icons/open-compass_logo.png b/app/assets/images/cached-icons/open-compass_logo.png new file mode 100644 index 0000000..1474f96 Binary files /dev/null and b/app/assets/images/cached-icons/open-compass_logo.png differ diff --git a/app/assets/images/cached-icons/openai_logo.png b/app/assets/images/cached-icons/openai_logo.png new file mode 100644 index 0000000..eca73a4 Binary files /dev/null and b/app/assets/images/cached-icons/openai_logo.png differ diff --git a/app/assets/images/cached-icons/p-lambda_logo.png 
b/app/assets/images/cached-icons/p-lambda_logo.png new file mode 100644 index 0000000..af61633 Binary files /dev/null and b/app/assets/images/cached-icons/p-lambda_logo.png differ diff --git a/app/assets/images/cached-icons/princeton-nlp_logo.png b/app/assets/images/cached-icons/princeton-nlp_logo.png new file mode 100644 index 0000000..d00d04c Binary files /dev/null and b/app/assets/images/cached-icons/princeton-nlp_logo.png differ diff --git a/app/assets/images/cached-icons/rom1504_logo.png b/app/assets/images/cached-icons/rom1504_logo.png new file mode 100644 index 0000000..0df54dc Binary files /dev/null and b/app/assets/images/cached-icons/rom1504_logo.png differ diff --git a/app/assets/images/cached-icons/salesforce_logo.png b/app/assets/images/cached-icons/salesforce_logo.png new file mode 100644 index 0000000..a91bddf Binary files /dev/null and b/app/assets/images/cached-icons/salesforce_logo.png differ diff --git a/app/assets/images/cached-icons/sangmichaelxie_logo.png b/app/assets/images/cached-icons/sangmichaelxie_logo.png new file mode 100644 index 0000000..f51bb3b Binary files /dev/null and b/app/assets/images/cached-icons/sangmichaelxie_logo.png differ diff --git a/app/assets/images/cached-icons/sberdevices_logo.png b/app/assets/images/cached-icons/sberdevices_logo.png new file mode 100644 index 0000000..7c9c731 Binary files /dev/null and b/app/assets/images/cached-icons/sberdevices_logo.png differ diff --git a/app/assets/images/cached-icons/stanford-crfm_logo.png b/app/assets/images/cached-icons/stanford-crfm_logo.png new file mode 100644 index 0000000..d693bd6 Binary files /dev/null and b/app/assets/images/cached-icons/stanford-crfm_logo.png differ diff --git a/app/assets/images/cached-icons/stas00_logo.png b/app/assets/images/cached-icons/stas00_logo.png new file mode 100644 index 0000000..5d1e596 Binary files /dev/null and b/app/assets/images/cached-icons/stas00_logo.png differ diff --git a/app/assets/images/cached-icons/suzgunmirac_logo.png 
b/app/assets/images/cached-icons/suzgunmirac_logo.png new file mode 100644 index 0000000..d58c5dd Binary files /dev/null and b/app/assets/images/cached-icons/suzgunmirac_logo.png differ diff --git a/app/assets/images/cached-icons/swj0419_logo.png b/app/assets/images/cached-icons/swj0419_logo.png new file mode 100644 index 0000000..466d39d Binary files /dev/null and b/app/assets/images/cached-icons/swj0419_logo.png differ diff --git a/app/assets/images/cached-icons/thu-coai_logo.png b/app/assets/images/cached-icons/thu-coai_logo.png new file mode 100644 index 0000000..741c6c4 Binary files /dev/null and b/app/assets/images/cached-icons/thu-coai_logo.png differ diff --git a/app/assets/images/cached-icons/togethercomputer_logo.png b/app/assets/images/cached-icons/togethercomputer_logo.png new file mode 100644 index 0000000..11cc36b Binary files /dev/null and b/app/assets/images/cached-icons/togethercomputer_logo.png differ diff --git a/app/assets/images/cached-icons/unitaryai_logo.png b/app/assets/images/cached-icons/unitaryai_logo.png new file mode 100644 index 0000000..2db95d8 Binary files /dev/null and b/app/assets/images/cached-icons/unitaryai_logo.png differ diff --git a/app/assets/images/cached-icons/viswavi_logo.png b/app/assets/images/cached-icons/viswavi_logo.png new file mode 100644 index 0000000..334f05c Binary files /dev/null and b/app/assets/images/cached-icons/viswavi_logo.png differ diff --git a/app/assets/images/cached-icons/webdataset_logo.png b/app/assets/images/cached-icons/webdataset_logo.png new file mode 100644 index 0000000..94b6c29 Binary files /dev/null and b/app/assets/images/cached-icons/webdataset_logo.png differ diff --git a/app/assets/images/cached-icons/wenet-e2e_logo.png b/app/assets/images/cached-icons/wenet-e2e_logo.png new file mode 100644 index 0000000..ddafc97 Binary files /dev/null and b/app/assets/images/cached-icons/wenet-e2e_logo.png differ diff --git a/app/assets/images/favicon.png b/app/assets/images/favicon.png new file mode 
100644 index 0000000..b4fca3f Binary files /dev/null and b/app/assets/images/favicon.png differ diff --git a/app/assets/images/fmcheatsheet-logo-dark.png b/app/assets/images/fmcheatsheet-logo-dark.png new file mode 100644 index 0000000..0cf5ca1 Binary files /dev/null and b/app/assets/images/fmcheatsheet-logo-dark.png differ diff --git a/app/assets/images/fmcheatsheet-logo-large.png b/app/assets/images/fmcheatsheet-logo-large.png new file mode 100644 index 0000000..a88c142 Binary files /dev/null and b/app/assets/images/fmcheatsheet-logo-large.png differ diff --git a/app/assets/images/fmcheatsheet-logo.png b/app/assets/images/fmcheatsheet-logo.png new file mode 100644 index 0000000..f057de8 Binary files /dev/null and b/app/assets/images/fmcheatsheet-logo.png differ diff --git a/app/assets/images/fmcheatsheet.png b/app/assets/images/fmcheatsheet.png new file mode 100644 index 0000000..1110f62 Binary files /dev/null and b/app/assets/images/fmcheatsheet.png differ diff --git a/app/assets/images/foundation-model-cheatsheet.png b/app/assets/images/foundation-model-cheatsheet.png new file mode 100644 index 0000000..a4dfba3 Binary files /dev/null and b/app/assets/images/foundation-model-cheatsheet.png differ diff --git a/app/resources/logos/cheatsheet-0.png b/app/assets/images/foundation-models.png similarity index 100% rename from app/resources/logos/cheatsheet-0.png rename to app/assets/images/foundation-models.png diff --git a/app/assets/images/home/foundation-training-flowchart.png b/app/assets/images/home/foundation-training-flowchart.png new file mode 100644 index 0000000..c0589ff Binary files /dev/null and b/app/assets/images/home/foundation-training-flowchart.png differ diff --git a/app/assets/images/home/pdf/cheatsheedpdf2.png b/app/assets/images/home/pdf/cheatsheedpdf2.png new file mode 100644 index 0000000..bb3755d Binary files /dev/null and b/app/assets/images/home/pdf/cheatsheedpdf2.png differ diff --git a/app/assets/images/home/pdf/cheatsheetpdf.png 
b/app/assets/images/home/pdf/cheatsheetpdf.png new file mode 100644 index 0000000..5b5f8e6 Binary files /dev/null and b/app/assets/images/home/pdf/cheatsheetpdf.png differ diff --git a/app/assets/images/no-search-found.png b/app/assets/images/no-search-found.png new file mode 100755 index 0000000..1e1e6e1 Binary files /dev/null and b/app/assets/images/no-search-found.png differ diff --git a/app/assets/images/resource-icons/github.png b/app/assets/images/resource-icons/github.png new file mode 100644 index 0000000..6bd03ac Binary files /dev/null and b/app/assets/images/resource-icons/github.png differ diff --git a/app/assets/images/resource-icons/huggingface.png b/app/assets/images/resource-icons/huggingface.png new file mode 100644 index 0000000..93dfa15 Binary files /dev/null and b/app/assets/images/resource-icons/huggingface.png differ diff --git a/app/assets/images/resource-icons/paper.png b/app/assets/images/resource-icons/paper.png new file mode 100644 index 0000000..71031c6 Binary files /dev/null and b/app/assets/images/resource-icons/paper.png differ diff --git a/app/assets/images/supporters/supporters.png b/app/assets/images/supporters/supporters.png new file mode 100644 index 0000000..00760df Binary files /dev/null and b/app/assets/images/supporters/supporters.png differ diff --git a/app/assets/resources.json b/app/assets/resources.json new file mode 100644 index 0000000..2746c98 --- /dev/null +++ b/app/assets/resources.json @@ -0,0 +1,4823 @@ +[ + { + "name": "BigBench Hard", + "description": "A challenging subset of 23 BigBench tasks where at time of release models did not outperform annotator performance.", + "modalities": [ + "Text" + ], + "categories": [ + "Capabilities" + ], + "date": "10-2022", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2210.09261", + "website_link": "", + "github_link": "https://github.com/suzgunmirac/BIG-Bench-Hard", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": 
"suzgunmirac_logo.png" + }, + { + "name": "BigCode Evaluation Harness", + "description": "A framework for the evaluation of code generation models, compiling many evaluation sets.", + "modalities": [ + "Text" + ], + "categories": [ + "Capabilities" + ], + "date": "Frequently Updated", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "", + "github_link": "https://github.com/bigcode-project/bigcode-evaluation-harness/tree/main", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "bigcode-project_logo.png" + }, + { + "name": "CLIP benchmark", + "description": "Image classification, retrieval and captioning", + "modalities": [ + "Text", + "Vision" + ], + "categories": [ + "Capabilities" + ], + "date": "4-2022", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "", + "github_link": "https://github.com/LAION-AI/CLIP_benchmark", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "LAION-AI_logo.png" + }, + { + "name": "DataComp eval suite", + "description": "38 image classification and retrieval downstream tasks", + "modalities": [ + "Text", + "Vision" + ], + "categories": [ + "Capabilities" + ], + "date": "4-2023", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2304.14108", + "website_link": "https://www.datacomp.ai/", + "github_link": "https://github.com/mlfoundations/datacomp#evaluation", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "mlfoundations_logo.png" + }, + { + "name": "HEIM", + "description": "A large suite of text-to-image evaluations. 
Useful for thorough capability analysis of these model types.", + "modalities": [ + "Text", + "Vision" + ], + "categories": [ + "Capabilities" + ], + "date": "11-2023", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://crfm.stanford.edu/heim/v1.1.0/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "HELM classic", + "description": "A large suite of benchmarks and metric types, to holistically evaluate many model qualities aside from performance on general tasks. Useful for a thorough comparison against other well known models.", + "modalities": [ + "Text" + ], + "categories": [ + "Capabilities" + ], + "date": "11-2022", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/2211.09110", + "website_link": "https://crfm.stanford.edu/helm/latest/", + "github_link": "https://github.com/stanford-crfm/helm", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "stanford-crfm_logo.png" + }, + { + "name": "Hugging Face Leaderboards Collection", + "description": "A collection of unique leaderboards on Hugging Face for ranking models across modalities and tasks.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Capabilities" + ], + "date": "Frequently Updated", + "primary_link": "Hugging Face object", + "paper_link": "", + "website_link": "https://huggingface.co/blog?tag=leaderboard", + "github_link": "", + "huggingface_link": "https://huggingface.co/collections/clefourrier/leaderboards-and-benchmarks-64f99d2e11e92ca5568a7cce", + "added_by": "Original Authors" + }, + { + "name": "HumanEvalPack", + "description": "HumanEvalPack is a code evaluation benchmark across 6 languages and 3 tasks, extending OpenAI's HumanEval.", + "modalities": [ + "Text" + ], + "categories": [ + "Capabilities" + ], + "date": "8-2023", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2308.07124", + "website_link": "", + "github_link": 
"https://github.com/bigcode-project/octopack", + "huggingface_link": "https://huggingface.co/datasets/bigcode/humanevalpack", + "added_by": "Original Authors", + "logo": "bigcode-project_logo.png" + }, + { + "name": "Lighteval", + "description": "Small, highly configurable LLM evaluation library, for fast experimentation and iteration.", + "modalities": [ + "Text" + ], + "categories": [ + "Capabilities" + ], + "date": "Frequently Updated", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "", + "github_link": "https://github.com/huggingface/lighteval", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "huggingface_logo.png" + }, + { + "name": "LM Evaluation Harness", + "description": "Orchestration framework for standardizing LM prompted evaluation, supporting hundreds of subtasks.", + "modalities": [ + "Text" + ], + "categories": [ + "Capabilities", + "Reproducibility" + ], + "date": "Frequently Updated", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "", + "github_link": "https://github.com/EleutherAI/lm-evaluation-harness", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "EleutherAI_logo.png" + }, + { + "name": "LMSys Chatbot Arena", + "description": "A leaderboard of models based on Elo ratings where humans or models select their preferred response between two anonymous models. Chatbot Arena, MT-Bench, and 5-shot MMLU are used as benchmarks. 
This resource provides a general purpose, and GPT-4 biased perspective into model capabilities.", + "modalities": [ + "Text" + ], + "categories": [ + "Capabilities" + ], + "date": "Frequently Updated", + "primary_link": "Hugging Face object", + "paper_link": "https://arxiv.org/abs/2306.05685", + "website_link": "", + "github_link": "https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md", + "huggingface_link": "https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard", + "added_by": "Original Authors", + "logo": "lm-sys_logo.png" + }, + { + "name": "MMBench", + "description": "A joint vision and text benchmark evaluating dozens of capabilities, using curated datasets and ChatGPT in the loop.", + "modalities": [ + "Text", + "Vision" + ], + "categories": [ + "Capabilities" + ], + "date": "7-2023", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2307.06281", + "website_link": "https://opencompass.org.cn/mmbench", + "github_link": "https://github.com/open-compass/MMBench", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "open-compass_logo.png" + }, + { + "name": "MME", + "description": "An evaluation benchmark for multimodal large language models with 14 manually curated subtasks, to avoid data leakage.", + "modalities": [ + "Text", + "Vision" + ], + "categories": [ + "Capabilities" + ], + "date": "6-2023", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2306.13394", + "website_link": "", + "github_link": "https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "BradyFU_logo.png" + }, + { + "name": "MTEB", + "description": "The Massive Text Embedding Benchmark measures the quality of embeddings across 58 datasets and 112 languages for tasks related to retrieval, classification, clustering or semantic similarity.", + "modalities": [ + "Text" + ], + "categories": [ + "Capabilities" + ], + "date": "10-2022", + 
"primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2210.07316", + "website_link": "", + "github_link": "https://github.com/embeddings-benchmark/mteb", + "huggingface_link": "https://huggingface.co/spaces/mteb/leaderboard", + "added_by": "Original Authors", + "logo": "embeddings-benchmark_logo.png" + }, + { + "name": "OpenASR Leaderboard", + "description": "An automatic leaderboard ranking and evaluating speech recognition models on common benchmarks.", + "modalities": [ + "Speech" + ], + "categories": [ + "Capabilities" + ], + "date": "Frequently Updated", + "primary_link": "Hugging Face object", + "paper_link": "", + "website_link": "", + "github_link": "https://github.com/huggingface/open_asr_leaderboard", + "huggingface_link": "https://huggingface.co/spaces/hf-audio/open_asr_leaderboard", + "added_by": "Original Authors", + "logo": "huggingface_logo.png" + }, + { + "name": "OpenFlamingo eval suite", + "description": "VQA, captioning, classification", + "modalities": [ + "Text", + "Vision" + ], + "categories": [ + "Capabilities" + ], + "date": "8-2023", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2308.01390", + "website_link": "", + "github_link": "https://github.com/mlfoundations/open_flamingo/tree/main/open_flamingo/eval", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "mlfoundations_logo.png" + }, + { + "name": "Open LLM Leaderboard", + "description": "A popular leaderboard on Hugging Face for ranking open LLMs on their knowledge, reasoning and math capabilities.", + "modalities": [ + "Text" + ], + "categories": [ + "Capabilities" + ], + "date": "Frequently Updated", + "primary_link": "Hugging Face object", + "paper_link": "", + "website_link": "", + "github_link": "", + "huggingface_link": "https://huggingface.co/open-llm-leaderboard", + "added_by": "Original Authors" + }, + { + "name": "SWE Bench", + "description": "SWE-bench is a benchmark for evaluating large language models on real world software 
issues collected from GitHub. Given a codebase and an issue, a language model is tasked with generating a patch that resolves the described problem.", + "modalities": [ + "Text" + ], + "categories": [ + "Capabilities" + ], + "date": "10-2023", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/2310.06770", + "website_link": "https://www.swebench.com/", + "github_link": "https://github.com/princeton-nlp/SWE-bench", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "princeton-nlp_logo.png" + }, + { + "name": "The Edinburgh International Accents of English Corpus", + "description": "Benchmark dataset of diverse English varieties for evaluating automatic speech recognition models (typically trained and tested only on US English)", + "modalities": [ + "Speech" + ], + "categories": [ + "Capabilities" + ], + "date": "3-2023", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/2303.18110", + "website_link": "https://groups.inf.ed.ac.uk/edacc/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "HELM lite", + "description": "A lightweight subset of capability-centric benchmarks within HELM with comparisons to many prominent open and closed models.", + "modalities": [ + "Text" + ], + "categories": [ + "Capabilities" + ], + "date": "12-2023", + "primary_link": "Webpage", + "paper_link": "https://crfm.stanford.edu/2023/12/19/helm-lite.html", + "website_link": "https://crfm.stanford.edu/helm/lite/latest/#/", + "github_link": "https://github.com/stanford-crfm/helm", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "stanford-crfm_logo.png" + }, + { + "name": "MMMU", + "description": "A benchmark to evaluate joint text and vision models on 11k examples spanning 30 college-level subject domains.", + "modalities": [ + "Text", + "Vision" + ], + "categories": [ + "Capabilities" + ], + "date": "11-2023", + "primary_link": "Webpage", + "paper_link": 
"https://arxiv.org/abs/2311.16502", + "website_link": "https://mmmu-benchmark.github.io/", + "github_link": "https://github.com/MMMU-Benchmark/MMMU", + "huggingface_link": "https://huggingface.co/datasets/MMMU/MMMU", + "added_by": "Original Authors", + "logo": "MMMU-Benchmark_logo.png" + }, + { + "name": "Anaconda", + "description": "An environment and dependency management tool.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Reproducibility" + ], + "date": "Frequently Updated", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://www.anaconda.com/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Colab Notebooks", + "description": "A tool to execute and share reproducible code snippets.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Reproducibility" + ], + "date": "Frequently Updated", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://colab.research.google.com/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Docker", + "description": "An environment and dependency management tool.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Reproducibility" + ], + "date": "Frequently Updated", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://docker-curriculum.com/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Jupyter Notebooks", + "description": "A tool to execute and share reproducible code snippets.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Reproducibility" + ], + "date": "Frequently Updated", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://jupyter.org/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Semver", + "description": "A 
widely used protocol for versioning software, to ensure easy reproducibility.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Reproducibility" + ], + "date": "", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://semver.org/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Reforms", + "description": "Reporting Standards for ML-based Science.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Reproducibility" + ], + "date": "8-2023", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/2308.07832", + "website_link": "https://reforms.cs.princeton.edu/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "A Retrospective Datasheet for BookCorpus", + "description": "A third party datasheet for BookCorpus", + "modalities": [ + "Text" + ], + "categories": [ + "Data Auditing" + ], + "date": "5-2021", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/abs/2105.05241", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Data Provenance Initiative", + "description": "A large scale audit of 2000+ popular datasets in AI.", + "modalities": [ + "Text" + ], + "categories": [ + "Data Auditing" + ], + "date": "Frequently Updated", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/abs/2310.16787", + "website_link": "https://www.dataprovenance.org/", + "github_link": "https://github.com/Data-Provenance-Initiative/Data-Provenance-Collection", + "huggingface_link": "https://huggingface.co/DataProvenanceInitiative", + "added_by": "Original Authors", + "logo": "Data-Provenance-Initiative_logo.png" + }, + { + "name": "Datasheet for the Pile", + "description": "A datasheet for the Pile", + "modalities": [ + "Text" + ], + "categories": [ + "Data Auditing" + ], + "date": "1-2022", + "primary_link":
"Paper", + "paper_link": "https://arxiv.org/abs/2201.07311", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "HaveIBeenTrained", + "description": "A combination search tool / opt out tool for LAION", + "modalities": [ + "Text", + "Vision" + ], + "categories": [ + "Data Auditing", + "Data Governance" + ], + "date": "Frequently Updated", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://haveibeentrained.com/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Into the LAIONs Den", + "description": "Auditing hateful content in text-to-vision datasets.", + "modalities": [ + "Text", + "Vision" + ], + "categories": [ + "Data Auditing" + ], + "date": "9-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/abs/2311.03449", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Multimodal datasets: misogyny, pornography, and malignant stereotypes", + "description": "Auditing vision datasets for sensitive content.", + "modalities": [ + "Text", + "Vision" + ], + "categories": [ + "Data Auditing" + ], + "date": "10-2021", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/abs/2110.01963", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "On Hate Scaling Laws For Data-Swamps", + "description": "Auditing text and vision datasets for systemic biases and hate.", + "modalities": [ + "Text", + "Vision" + ], + "categories": [ + "Data Auditing" + ], + "date": "6-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/abs/2306.13141", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Quality at a Glance", + "description": "An audit of allegedly multilingual parallel text corpora.", + 
"modalities": [ + "Text" + ], + "categories": [ + "Data Auditing" + ], + "date": "3-2021", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/abs/2103.12028", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Training Data Transparency Blog", + "description": "A blog on transparency for training data in AI.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Data Auditing" + ], + "date": "12-2023", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://huggingface.co/blog/yjernite/data-transparency", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Data Selection via Importance Resampling (DSIR)", + "description": "A tool for selecting data with a similar distribution to a target dataset", + "modalities": [ + "Text" + ], + "categories": [ + "Data Cleaning" + ], + "date": "12-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/abs/2302.03169", + "website_link": "", + "github_link": "https://github.com/p-lambda/dsir", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "p-lambda_logo.png" + }, + { + "name": "DataComp filtering", + "description": "Various quality filters", + "modalities": [ + "Text", + "Vision" + ], + "categories": [ + "Data Cleaning" + ], + "date": "4-2023", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/2304.14108", + "website_link": "https://www.datacomp.ai/", + "github_link": "https://github.com/mlfoundations/datacomp/tree/main#baselines", + "huggingface_link": "https://huggingface.co/datasets/mlfoundations/datacomp_1b", + "added_by": "Original Authors", + "logo": "mlfoundations_logo.png" + }, + { + "name": "DataComp pre-filtering", + "description": "NSFW detection, dedup with eval datasets", + "modalities": [ + "Text", + "Vision" + ], + "categories": [ + "Data Cleaning" + ], + "date": "4-2023", + "primary_link": 
"Webpage", + "paper_link": "https://arxiv.org/abs/2304.14108", + "website_link": "https://www.datacomp.ai/", + "github_link": "https://github.com/mlfoundations/dataset2metadata", + "huggingface_link": "https://huggingface.co/datasets/mlfoundations/datacomp_1b", + "added_by": "Original Authors", + "logo": "mlfoundations_logo.png" + }, + { + "name": "Detoxify", + "description": "A python library designed to identify toxic language in comments. Functions in seven languages: English, Italian, French, Russian, Portuguese, Spanish, Turking.", + "modalities": [ + "Text" + ], + "categories": [ + "Data Cleaning" + ], + "date": "", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "", + "github_link": "https://github.com/unitaryai/detoxify", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "unitaryai_logo.png" + }, + { + "name": "Dolma's Toolkit", + "description": "A Python framework for defining Taggers that identify non-language text, language ID, PII, toxic text, and \"quality\" text. 
Includes reimplementation of heuristics used by Gopher and C4 for non-natural language.", + "modalities": [ + "Text" + ], + "categories": [ + "Data Cleaning" + ], + "date": "8-2023", + "primary_link": "Hugging Face object", + "paper_link": "", + "website_link": "", + "github_link": "https://github.com/allenai/dolma", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "allenai_logo.png" + }, + { + "name": "DoReMi", + "description": "A github repository for Domain Reweighting with Minimax Optimization", + "modalities": [ + "Text" + ], + "categories": [ + "Data Cleaning" + ], + "date": "5-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/abs/2305.10429", + "website_link": "", + "github_link": "https://github.com/sangmichaelxie/doremi", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "sangmichaelxie_logo.png" + }, + { + "name": "fastText language classifier", + "description": "A tool for classifying the language of text", + "modalities": [ + "Text" + ], + "categories": [ + "Data Cleaning" + ], + "date": "5-2023", + "primary_link": "Hugging Face object", + "paper_link": "", + "website_link": "", + "github_link": "", + "huggingface_link": "https://huggingface.co/facebook/fasttext-language-identification", + "added_by": "Original Authors" + }, + { + "name": "FUN-LangID", + "description": "Frequently Used N-grams Language ID model, a character 4-gram model trained to recognize up to 1633 languages.", + "modalities": [ + "Text" + ], + "categories": [ + "Data Cleaning" + ], + "date": "9-2023", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "", + "github_link": "https://github.com/google-research/url-nlp/tree/main/fun-langid", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "google-research_logo.png" + }, + { + "name": "GlotLID", + "description": "A model for identifying languages, with support for more than 1600 languages.", + "modalities": [ + "Text" + ], + "categories": [ + 
"Data Cleaning" + ], + "date": "", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2310.16248", + "website_link": "", + "github_link": "https://github.com/cisnlp/GlotLID", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "cisnlp_logo.png" + }, + { + "name": "Langdetect", + "description": "A tool to predict the language of text, used to filter out/in data from the desired languages", + "modalities": [ + "Text" + ], + "categories": [ + "Data Cleaning" + ], + "date": "5-2021", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "https://pypi.org/project/langdetect/", + "github_link": "https://github.com/Mimino666/langdetect", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "Mimino666_logo.png" + }, + { + "name": "Lilac", + "description": "A python package for better understanding your data. Includes keyword and semantic search, as well as detection for PII, duplicates, and language.", + "modalities": [ + "Text" + ], + "categories": [ + "Data Cleaning" + ], + "date": "9-2023", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "https://www.lilacml.com/", + "github_link": "https://github.com/lilacai/lilac", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "lilacai_logo.png" + }, + { + "name": "Online Data Mixing", + "description": "A github repository for efficient online data mixing", + "modalities": [ + "Text" + ], + "categories": [ + "Data Cleaning" + ], + "date": "12-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/abs/2312.02406", + "website_link": "", + "github_link": "https://github.com/alon-albalak/online-data-mixing", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "alon-albalak_logo.png" + }, + { + "name": "OpenLID", + "description": "A model (and data used to train the model) for identifying 200+ languages.", + "modalities": [ + "Text" + ], + "categories": [ + "Data Cleaning" + ], + "date": "", + 
"primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2305.13820", + "website_link": "", + "github_link": "https://github.com/laurieburchell/open-lid-dataset", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "laurieburchell_logo.png" + }, + { + "name": "Roots data cleaning pipeline", + "description": "A pipeline for processing and improving quality of crowdsourced datasets", + "modalities": [ + "Text" + ], + "categories": [ + "Data Cleaning" + ], + "date": "10-2022", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "", + "github_link": "https://github.com/bigscience-workshop/data-preparation/tree/main/preprocessing/training/01a_catalogue_cleaning_and_filtering", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "bigscience-workshop_logo.png" + }, + { + "name": "SpeechBrain\u2019s Spoken language ID model", + "description": "Pre-trained spoken language identification model trained on VoxLingua107, dataset of audio sourced from YouTube for 107 languages", + "modalities": [ + "Speech" + ], + "categories": [ + "Data Cleaning" + ], + "date": "6-2021", + "primary_link": "Hugging Face object", + "paper_link": "https://arxiv.org/abs/2106.04624", + "website_link": "", + "github_link": "", + "huggingface_link": "https://huggingface.co/speechbrain/lang-id-voxlingua107-ecapa", + "added_by": "Original Authors" + }, + { + "name": "The Pile processing scripts", + "description": "A series of scripts to replicate the Pile dataset. 
Includes filtering and cleaning for: language, profanity, deduplication, and test set decontamination.", + "modalities": [ + "Text" + ], + "categories": [ + "Data Cleaning" + ], + "date": "12-2020", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "", + "github_link": "https://github.com/EleutherAI/the-pile/tree/master/processing_scripts", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "EleutherAI_logo.png" + }, + { + "name": "BigBench Canaries", + "description": "BigBench's \"Training on the Test Set\" Task provides guidance on using canaries to check if an evaluation set was trained on.", + "modalities": [ + "Text" + ], + "categories": [ + "Data Decontamination" + ], + "date": "10-2021", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "", + "github_link": "https://github.com/google/BIG-bench/blob/main/bigbench/benchmark_tasks/training_on_test_set/README.md#training-on-the-test-set", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "google_logo.png" + }, + { + "name": "Carper AI Decontamination Tool", + "description": "A repository, heavily based on the BigCode repository, to decontaminate evaluation sets from a text training set.", + "modalities": [ + "Text" + ], + "categories": [ + "Data Decontamination" + ], + "date": "1-2023", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "", + "github_link": "https://github.com/CarperAI/decontamination/tree/main", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "CarperAI_logo.png" + }, + { + "name": "Data Portraits", + "description": "A tool to test for membership inference of popular datasets, like The Pile or The Stack, i.e.
whether a model has seen certain data.", + "modalities": [ + "Text" + ], + "categories": [ + "Data Decontamination" + ], + "date": "3-2023", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/2303.03919", + "website_link": "https://dataportraits.org/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Detect Pretrain Data (Min-K Prob)", + "description": "Detect Pretrain Data (Min-K Prob)", + "modalities": [ + "Text" + ], + "categories": [ + "Data Decontamination" + ], + "date": "11-2023", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/2310.16789", + "website_link": "https://swj0419.github.io/detect-pretrain.github.io/", + "github_link": "https://github.com/swj0419/detect-pretrain-code", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "swj0419_logo.png" + }, + { + "name": "Interpreting Canary Exposure", + "description": "An explanation on how to interpret canary exposure, including by relating it to membership inference attacks, and differential privacy.", + "modalities": [ + "Text" + ], + "categories": [ + "Data Decontamination" + ], + "date": "5-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/abs/2306.00133", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Proving Test Set Contamination in Black Box Language Models", + "description": "A paper that provides methods for provable guarantees of test set contamination in language models without access to pretraining data or model weights.", + "modalities": [ + "Text" + ], + "categories": [ + "Data Decontamination" + ], + "date": "10-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/abs/2310.17623", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Apricot", + "description": "apricot implements submodular optimization for the 
purpose of summarizing massive data sets into minimally redundant subsets that are still representative of the original data. These subsets are useful for both visualizing the modalities in the data (such as in the two data sets below) and for training accurate machine learning models with just a fraction of the examples and compute.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Data Deduplication" + ], + "date": "7-1905", + "primary_link": "GitHub", + "paper_link": "https://dl.acm.org/doi/abs/10.5555/3455716.3455877", + "website_link": "", + "github_link": "https://github.com/jmschrei/apricot", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "jmschrei_logo.png" + }, + { + "name": "Datacomp image dedup", + "description": "Data to deduplicate vision datasets for the Datacomp challenge.", + "modalities": [ + "Vision" + ], + "categories": [ + "Data Deduplication" + ], + "date": "8-2023", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "https://www.datacomp.ai/", + "github_link": "https://github.com/mlfoundations/dataset2metadata", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "mlfoundations_logo.png" + }, + { + "name": "Dolma Dedupe Tool", + "description": "Dolma's text deduplication tool for pretraining data", + "modalities": [ + "Text" + ], + "categories": [ + "Data Deduplication" + ], + "date": "10-2023", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "", + "github_link": "https://github.com/allenai/dolma", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "allenai_logo.png" + }, + { + "name": "Google Text Deduplication", + "description": "A repository to deduplicate language model datasets. They release the ExactSubstr deduplication implementation (written in Rust) along with scripts to perform ExactSubstr deduplication and inspect the results (written in Python). 
They also release the document clusters resulting from running NearDup deduplication on C4, RealNews, LM1B, and Wiki-4B-en.", + "modalities": [ + "Text" + ], + "categories": [ + "Data Deduplication" + ], + "date": "7-2021", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2107.06499", + "website_link": "", + "github_link": "https://github.com/google-research/deduplicate-text-datasets", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "google-research_logo.png" + }, + { + "name": "RedPajama-Data", + "description": "Tools for: exact deduplication with bloom filter, fuzzy deduplication with LSH, calculating quality scores", + "modalities": [ + "Text" + ], + "categories": [ + "Data Deduplication" + ], + "date": "10-2023", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "", + "github_link": "https://github.com/togethercomputer/RedPajama-Data", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "togethercomputer_logo.png" + }, + { + "name": "Pile", + "description": "A set of tools for deduplication with MinHashLSH", + "modalities": [ + "Text" + ], + "categories": [ + "Data Deduplication" + ], + "date": "5-2023", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2101.00027", + "website_link": "", + "github_link": "https://huggingface.co/datasets/EleutherAI/pile-standard-pythia-preshuffled", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "datasets_logo.png" + }, + { + "name": "Data Cards Playbook", + "description": "A tool to create a Data Card that thoroughly documents a new dataset.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Data Documentation" + ], + "date": "6-2022", + "primary_link": "Webpage", + "paper_link": "https://dl.acm.org/doi/fullHtml/10.1145/3531146.3533231", + "website_link": "https://sites.research.google/datacardsplaybook/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + 
}, + { + "name": "Data Nutrition Labels", + "description": "A generic but thorough form of dataset documentation.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Data Documentation" + ], + "date": "2020", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/1805.03677", + "website_link": "https://datanutrition.org/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Data Provenance Attribution Card", + "description": "A repository to select datasets and generate a summary. It can also generate a bibtex to attribute all developers of the datasets.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Data Documentation" + ], + "date": "10-2023", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2310.16787", + "website_link": "https://www.dataprovenance.org/", + "github_link": "https://github.com/Data-Provenance-Initiative/Data-Provenance-Collection", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "Data-Provenance-Initiative_logo.png" + }, + { + "name": "Data Statements", + "description": "A data statement to thoroughly document a new dataset.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Data Documentation" + ], + "date": "2018", + "primary_link": "Paper", + "paper_link": "https://aclanthology.org/Q18-1041/", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Datasheets for Datasets", + "description": "A datasheet to thoroughly document a new dataset.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Data Documentation" + ], + "date": "3-2018", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/abs/1803.09010", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Datasheets for Digital Cultural 
Heritage Datasets", + "description": "A datasheet specifically designed for digital cultural heritage datasets and their considerations.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Data Documentation" + ], + "date": "2023", + "primary_link": "Paper", + "paper_link": "https://cris.unibo.it/handle/11585/947893", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Data Governance in the Age of Large-Scale Data-Driven Language Technology", + "description": "A paper detailing the data governance decisions undertaken during BigScience's BLOOM project. ", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Data Governance" + ], + "date": "5-2022", + "primary_link": "Paper", + "paper_link": "https://dl.acm.org/doi/abs/10.1145/3531146.3534637", + "website_link": "", + "github_link": "", + "huggingface_link": "https://huggingface.co/spaces/bigscience-data/roots-search", + "added_by": "Original Authors" + }, + { + "name": "Reclaiming the Digital Commons: A Public Data Trust for Training Data", + "description": "A paper that argues for the creation of a public data trust for collective input into the creation of AI systems and analyzes the feasibility of such a data trust.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Data Governance" + ], + "date": "3-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/abs/2303.09001", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "BigCode Governance Card", + "description": "A report outlining governance questions, approaches, and tooling in the BigCode project, with a focus on Data governance", + "modalities": [ + "Text" + ], + "categories": [ + "Data Governance" + ], + "date": "11-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/abs/2312.03872", + "website_link": "", + 
"github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "AmIinTheStack", + "description": "A tool to let software developers check whether their code was included in TheStack dataset and opt out of inclusion in future versions", + "modalities": [ + "Text" + ], + "categories": [ + "Data Governance" + ], + "date": "9-2022", + "primary_link": "Hugging Face object", + "paper_link": "", + "website_link": "", + "github_link": "", + "huggingface_link": "https://huggingface.co/spaces/bigcode/in-the-stack", + "added_by": "Original Authors" + }, + { + "name": "StarPII: BigCode Pseudonymization Model", + "description": "A model trained on a new dataset of PII in code used for pseudonymization of a dataset prior to training", + "modalities": [ + "Text" + ], + "categories": [ + "Data Governance" + ], + "date": "4-2023", + "primary_link": "Hugging Face object", + "paper_link": "", + "website_link": "", + "github_link": "", + "huggingface_link": "https://huggingface.co/bigcode/starpii", + "added_by": "Original Authors" + }, + { + "name": "French DPA Resource sheets on AI and GDPR", + "description": "A set of resource sheets focused on GDPR compliance covering legal basis for data collection, sharing, and best practices for handling personal data", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Data Governance" + ], + "date": "10-2023", + "primary_link": "Webpage", + "paper_link": "https://www.cnil.fr/en/ai-how-sheets", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "AI2 C4 Search Tool", + "description": "A search tool that lets users execute full-text queries to search Google's C4 Dataset.", + "modalities": [ + "Text" + ], + "categories": [ + "Data Exploration" + ], + "date": "7-1905", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://c4-search.apps.allenai.org/", + "github_link": "", + "huggingface_link":
"", + "added_by": "Original Authors" + }, + { + "name": "Data Finder", + "description": "A tool to help build search over academic datasets given a natural language description of the idea.", + "modalities": [ + "Text" + ], + "categories": [ + "Data Exploration" + ], + "date": "5-2023", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2305.16636", + "website_link": "", + "github_link": "https://github.com/viswavi/datafinder", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "viswavi_logo.png" + }, + { + "name": "Data Provenance Explorer", + "description": "An explorer tool for selecting, filtering, and visualizing popular finetuning, instruction, and alignment training datasets from Hugging Face, based on their metadata such as source, license, languages, tasks, topics, among other properties.", + "modalities": [ + "Text" + ], + "categories": [ + "Data Exploration" + ], + "date": "Frequently Updated", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/2310.16787", + "website_link": "https://www.dataprovenance.org/", + "github_link": "https://github.com/Data-Provenance-Initiative/Data-Provenance-Collection", + "huggingface_link": "https://huggingface.co/DataProvenanceInitiative", + "added_by": "Original Authors", + "logo": "Data-Provenance-Initiative_logo.png" + }, + { + "name": "GAIA Search Tool", + "description": "A search tool over C4, the Pile, ROOTS, and the text captions of LAION, developed with Pyserini (https://github.com/castorini/pyserini).", + "modalities": [ + "Text" + ], + "categories": [ + "Data Exploration" + ], + "date": "6-2023", + "primary_link": "Hugging Face object", + "paper_link": "https://arxiv.org/abs/2306.01481", + "website_link": "", + "github_link": "", + "huggingface_link": "https://huggingface.co/spaces/spacerini/gaia", + "added_by": "Original Authors" + }, + { + "name": "Hugging Face Data Measurements Tool", + "description": "A tool to analyze, measure, and compare properties of 
text finetuning data, including their distributional statistics, lengths, and vocabularies.", + "modalities": [ + "Text" + ], + "categories": [ + "Data Exploration" + ], + "date": "7-1905", + "primary_link": "Hugging Face object", + "paper_link": "", + "website_link": "", + "github_link": "", + "huggingface_link": "https://huggingface.co/spaces/huggingface/data-measurements-tool", + "added_by": "Original Authors" + }, + { + "name": "Know your data", + "description": "A tool for exploring over 70 vision datasets", + "modalities": [ + "Vision" + ], + "categories": [ + "Data Exploration" + ], + "date": "5-2021", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://knowyourdata-tfds.withgoogle.com/", + "github_link": "https://github.com/PAIR-code/knowyourdata", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "PAIR-code_logo.png" + }, + { + "name": "LAION search", + "description": "Nearest neighbor search based on CLIP embeddings", + "modalities": [ + "Text", + "Vision" + ], + "categories": [ + "Data Exploration" + ], + "date": "3-2022", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://rom1504.github.io/clip-retrieval/", + "github_link": "https://github.com/rom1504/clip-retrieval", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "rom1504_logo.png" + }, + { + "name": "NVIDIA Speech Data Explorer", + "description": "Tool for exploring speech data", + "modalities": [ + "Speech" + ], + "categories": [ + "Data Exploration" + ], + "date": "Frequently Updated", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/tools/speech_data_explorer.html", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "ROOTS Search Tool", + "description": "A tool, based on a BM25 index, to search over text for each language or group of languages included in the ROOTS 
pretraining dataset.", + "modalities": [ + "Text" + ], + "categories": [ + "Data Exploration" + ], + "date": "7-1905", + "primary_link": "Hugging Face object", + "paper_link": "", + "website_link": "", + "github_link": "", + "huggingface_link": "https://huggingface.co/spaces/bigscience-data/roots-search", + "added_by": "Original Authors" + }, + { + "name": "What's In My Big Data?", + "description": "A platform for analyzing large text datasets at scale", + "modalities": [ + "Text" + ], + "categories": [ + "Data Exploration" + ], + "date": "10-2023", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/2310.20707", + "website_link": "https://wimbd.apps.allenai.org/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "WIMBD", + "description": "A dataset analysis tool to count, search, and compare attributes across several massive pretraining corpora at scale, including C4, The Pile, and RedPajama.", + "modalities": [ + "Text" + ], + "categories": [ + "Data Exploration" + ], + "date": "11-2023", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/2310.20707", + "website_link": "https://wimbd.apps.allenai.org/", + "github_link": "https://github.com/allenai/wimbd", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "allenai_logo.png" + }, + { + "name": "Everything about Distributed Training and Efficient Finetuning", + "description": "A rundown and crash course in distributed training for deep learning, with an eye toward LLM finetuning and current useful tools and resources. 
Provides a good overview of the various (distributed) training strategies for efficient and scalable training.", + "modalities": [ + "Text" + ], + "categories": [ + "Additional Educational Resources" + ], + "date": "10-2023", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://sumanthrh.com/post/distributed-and-efficient-finetuning/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Machine Learning Engineering Online Book", + "description": "An \"online textbook\" and resource collection on ML engineering at scale, ranging from debugging distributed systems, parallelism strategies, effective use of large HPC clusters, and chronicles of past large-scale training runs with lessons learned.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Additional Educational Resources" + ], + "date": "Frequently Updated", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "", + "github_link": "https://github.com/stas00/ml-engineering", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "stas00_logo.png" + }, + { + "name": "nanoGPT", + "description": "A minimal, stripped-down training codebase for teaching purposes and easily-hackable yet performant small-scale training.", + "modalities": [ + "Text" + ], + "categories": [ + "Additional Educational Resources" + ], + "date": "12-2022", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "", + "github_link": "https://github.com/karpathy/nanoGPT", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "karpathy_logo.png" + }, + { + "name": "The EleutherAI Model Training Cookbook", + "description": "A set of resources on how to train large scale AI systems", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Additional Educational Resources" + ], + "date": "12-2023", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "", + 
"github_link": "https://github.com/EleutherAI/cookbook", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "EleutherAI_logo.png" + }, + { + "name": "Transformer Inference Arithmetic", + "description": "A blog post on the inference costs of transformer-based LMs. Useful for providing more insight into deep learning accelerators and inference-relevant decisions to make when training a model.", + "modalities": [ + "Text" + ], + "categories": [ + "Additional Educational Resources" + ], + "date": "3-2022", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://kipp.ly/transformer-inference-arithmetic/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Transformer Math 101", + "description": "An introductory blog post on training costs of LLMs, going over useful formulas and considerations from a high to low level", + "modalities": [ + "Text" + ], + "categories": [ + "Additional Educational Resources" + ], + "date": "4-2023", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://blog.eleuther.ai/transformer-math/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Azure Emissions Impact Dashboard", + "description": "Monitoring the environmental impact of training machine learning models on Azure", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Environmental Impact" + ], + "date": "10-2021", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://www.microsoft.com/en-us/sustainability/emissions-impact-dashboard", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Carbontracker", + "description": "carbontracker is a tool for tracking and predicting the energy consumption and carbon footprint of training deep learning models as described in Anthony et al. 
(2020).", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Environmental Impact" + ], + "date": "7-2020", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2007.03051", + "website_link": "", + "github_link": "https://github.com/lfwa/carbontracker", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "lfwa_logo.png" + }, + { + "name": "CodeCarbon", + "description": "Estimate and track carbon emissions from your computer, quantify and analyze their impact.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Environmental Impact" + ], + "date": "11-2020", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "https://mlco2.github.io/codecarbon/", + "github_link": "https://github.com/mlco2/codecarbon", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "mlco2_logo.png" + }, + { + "name": "Estimating the Carbon Footprint of BLOOM, a 176B Parameter Language Model", + "description": "A comprehensive account of the broader environmental impact of the BLOOM language model.", + "modalities": [ + "Text" + ], + "categories": [ + "Environmental Impact" + ], + "date": "6-2023", + "primary_link": "Paper", + "paper_link": "https://jmlr.org/papers/v24/23-0069.html", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Experiment Impact Tracker", + "description": "The experiment-impact-tracker is meant to be a simple drop-in method to track energy usage, carbon emissions, and compute utilization of your system. Currently, on Linux systems with Intel chips (that support the RAPL or powergadget interfaces) and NVIDIA GPUs, we record: power draw from CPU and GPU, hardware information, python package versions, estimated carbon emissions information, etc. 
In California we even support realtime carbon emission information by querying caiso.com!", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Environmental Impact" + ], + "date": "1-2020", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2002.05651", + "website_link": "", + "github_link": "https://github.com/Breakend/experiment-impact-tracker", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "Breakend_logo.png" + }, + { + "name": "Google Cloud Carbon Footprint Measurement", + "description": "Tracking the emissions of using Google's cloud compute resources", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Environmental Impact" + ], + "date": "10-2021", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://cloud.google.com/carbon-footprint?hl=en", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Making AI Less \"Thirsty\"", + "description": "Uncovering and Addressing the Secret Water Footprint of AI Models, and estimating water usage for training and deploying LLMs.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Environmental Impact" + ], + "date": "4-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/abs/2304.03271", + "website_link": "", + "github_link": "https://github.com/Ren-Research/Making-AI-Less-Thirsty", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "Ren-Research_logo.png" + }, + { + "name": "ML CO2 Impact", + "description": "A tool for estimating carbon impacts of ML training", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Environmental Impact" + ], + "date": "10-2019", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/1910.09700", + "website_link": "https://mlco2.github.io/impact/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { 
+ "name": "Scaling Laws for Neural Language Models", + "description": "Provide scaling laws to determine the optimal allocation of a fixed compute budget.", + "modalities": [ + "Text" + ], + "categories": [ + "Environmental Impact", + "Efficiency & Resource Allocation" + ], + "date": "1-2020", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/abs/2001.08361", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Training Compute-Optimal Large Language Models", + "description": "Provides details on the optimal model size and number of tokens for training a transformer-based language model in a given computational budget.", + "modalities": [ + "Text" + ], + "categories": [ + "Environmental Impact" + ], + "date": "3-2022", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/abs/2203.15556", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "AI4Bh\u0101rat Indic NLP", + "description": "A repository of Indian language text and speech resources, including datasets.", + "modalities": [ + "Text", + "Speech" + ], + "categories": [ + "Finetuning Data Catalogs" + ], + "date": "Frequently Updated", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "https://ai4bharat.iitm.ac.in/", + "github_link": "https://github.com/AI4Bharat", + "huggingface_link": "https://huggingface.co/ai4bharat", + "added_by": "Original Authors", + "logo": "AI4Bharat_logo.png" + }, + { + "name": "Arabic NLP Data Catalogue", + "description": "A catalogue of hundreds of Arabic text and speech finetuning datasets, regularly updated.", + "modalities": [ + "Text", + "Speech" + ], + "categories": [ + "Finetuning Data Catalogs" + ], + "date": "Frequently Updated", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://arbml.github.io/masader/", + "github_link": "https://github.com/ARBML", + "huggingface_link": "", + 
"added_by": "Original Authors", + "logo": "ARBML_logo.png" + }, + { + "name": "CHiME-5", + "description": "Speaker Diarization dataset comprising over 50 hours of conversational speech recordings collected from twenty real dinner parties that have taken place in real homes", + "modalities": [ + "Speech" + ], + "categories": [ + "Finetuning Data Catalogs" + ], + "date": "7-1905", + "primary_link": "Webpage", + "paper_link": "https://licensing.sheffield.ac.uk/product/chime5/print", + "website_link": "https://licensing.sheffield.ac.uk/product/chime5", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Data Provenance Collection", + "description": "A repository and explorer tool for selecting popular finetuning, instruction, and alignment training datasets from Hugging Face, based on data provenance and characteristics criteria.", + "modalities": [ + "Text" + ], + "categories": [ + "Finetuning Data Catalogs" + ], + "date": "Frequently Updated", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/2310.16787", + "website_link": "https://www.dataprovenance.org/", + "github_link": "https://github.com/Data-Provenance-Initiative/Data-Provenance-Collection", + "huggingface_link": "https://huggingface.co/DataProvenanceInitiative", + "added_by": "Original Authors", + "logo": "Data-Provenance-Initiative_logo.png" + }, + { + "name": "ImageNet", + "description": "An image classification dataset with 1.3M samples and 1000 classes", + "modalities": [ + "Vision" + ], + "categories": [ + "Finetuning Data Catalogs" + ], + "date": "6-2009", + "primary_link": "Webpage", + "paper_link": "https://ieeexplore.ieee.org/abstract/document/5206848", + "website_link": "https://www.image-net.org/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Indonesian NLP Data Catalogue", + "description": "A repository of hundreds of Indonesian language datasets.", + "modalities": [ + "Text",
+ "Speech" + ], + "categories": [ + "Finetuning Data Catalogs" + ], + "date": "Frequently Updated", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://indonlp.github.io/nusa-catalogue/", + "github_link": "https://github.com/IndoNLP/nusa-crowd", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "IndoNLP_logo.png" + }, + { + "name": "Lanfrica", + "description": "An online catalogue that provides links to African language resources (papers and datasets) in both texts and speech", + "modalities": [ + "Text", + "Speech" + ], + "categories": [ + "Finetuning Data Catalogs" + ], + "date": "Frequently Updated", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://lanfrica.com/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Masakhane NLP", + "description": "A repository of African language text and speech resources, including datasets.", + "modalities": [ + "Text", + "Speech" + ], + "categories": [ + "Finetuning Data Catalogs" + ], + "date": "Frequently Updated", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "https://www.masakhane.io/", + "github_link": "https://github.com/masakhane-io", + "huggingface_link": "https://huggingface.co/masakhane", + "added_by": "Original Authors", + "logo": "masakhane-io_logo.png" + }, + { + "name": "MS COCO", + "description": "Object detection, segmentation, captioning and retrieval dataset", + "modalities": [ + "Text", + "Vision" + ], + "categories": [ + "Finetuning Data Catalogs" + ], + "date": "5-2014", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/1405.0312", + "website_link": "https://cocodataset.org/#home", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "OpenSLR", + "description": "A collection of user-contributed datasets for various speech processing tasks", + "modalities": [ + "Speech" + ], + "categories": [ + "Finetuning 
Data Catalogs" + ], + "date": "Frequently Updated", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://www.openslr.org/resources.php", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "SEACrowd", + "description": "A repository of hundreds of South East Asian language datasets.", + "modalities": [ + "Text", + "Speech" + ], + "categories": [ + "Finetuning Data Catalogs" + ], + "date": "Frequently Updated", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://seacrowd.github.io/seacrowd-catalogue/", + "github_link": "https://github.com/SEACrowd", + "huggingface_link": "https://huggingface.co/NusaCrowd", + "added_by": "Original Authors", + "logo": "SEACrowd_logo.png" + }, + { + "name": "VoxCeleb", + "description": "Speaker Identification dataset comprising YouTube interviews from thousands of celebrities", + "modalities": [ + "Speech" + ], + "categories": [ + "Finetuning Data Catalogs" + ], + "date": "6-2017", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/1706.08612", + "website_link": "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "VoxLingua107", + "description": "Spoken language identification dataset created using audio extracted from YouTube videos retrieved using language-specific search phrases", + "modalities": [ + "Speech" + ], + "categories": [ + "Finetuning Data Catalogs" + ], + "date": "11-2020", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/2011.12998", + "website_link": "https://bark.phon.ioc.ee/voxlingua107/", + "github_link": "", + "huggingface_link": "https://huggingface.co/speechbrain/lang-id-voxlingua107-ecapa", + "added_by": "Original Authors" + }, + { + "name": "Zenodo AfricaNLP Community", + "description": "An online catalogue that provides African language resources (data and models) in both texts and
speech", + "modalities": [ + "Text", + "Speech" + ], + "categories": [ + "Finetuning Data Catalogs" + ], + "date": "Frequently Updated", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://zenodo.org/communities/africanlp", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Axolotl", + "description": "A repository for chat- or instruction-tuning language models, including through full fine-tuning, LoRA, QLoRA, and GPTQ.", + "modalities": [ + "Text" + ], + "categories": [ + "Finetuning Repositories" + ], + "date": "Frequently Updated", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "", + "github_link": "https://github.com/OpenAccess-AI-Collective/axolotl", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "OpenAccess-AI-Collective_logo.png" + }, + { + "name": "BLIP-2", + "description": "Fine-tuned LLMs on multimodal data using a projection layer", + "modalities": [ + "Text", + "Vision" + ], + "categories": [ + "Finetuning Repositories" + ], + "date": "1-2023", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2301.12597", + "website_link": "", + "github_link": "https://github.com/salesforce/LAVIS/tree/main/projects/blip2", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "salesforce_logo.png" + }, + { + "name": "LLaMA-Adapter", + "description": "Fine-tuned LLMs on multimodal data using adapters", + "modalities": [ + "Text", + "Vision" + ], + "categories": [ + "Finetuning Repositories" + ], + "date": "3-2023", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2304.15010", + "website_link": "", + "github_link": "https://github.com/OpenGVLab/LLaMA-Adapter", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "OpenGVLab_logo.png" + }, + { + "name": "LLaMA-Factory", + "description": "A framework for efficiently fine-tuning LLMs using cutting-edge algorithms with a user-friendly web UI", + 
"modalities": [ + "Text" + ], + "categories": [ + "Finetuning Repositories" + ], + "date": "Frequently Updated", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2403.13372", + "website_link": "", + "github_link": "https://github.com/hiyouga/LLaMA-Factory", + "huggingFace_link": "", + "added_by": "Original Authors" + }, + { + "name": "LLaVA", + "description": "Fine-tuned LLMs on multimodal data using a projection layer", + "modalities": [ + "Text", + "Vision" + ], + "categories": [ + "Finetuning Repositories" + ], + "date": "4-2023", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/2310.03744", + "website_link": "https://llava-vl.github.io/", + "github_link": "https://github.com/haotian-liu/LLaVA", + "huggingface_link": "https://huggingface.co/spaces/badayvedat/LLaVA", + "added_by": "Original Authors", + "logo": "haotian-liu_logo.png" + }, + { + "name": "MiniGPT4", + "description": "Fine-tuned LLMs on multimodal data using a projection layer", + "modalities": [ + "Text", + "Vision" + ], + "categories": [ + "Finetuning Repositories" + ], + "date": "4-2023", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/2304.10592", + "website_link": "https://minigpt-4.github.io/", + "github_link": "https://github.com/Vision-CAIR/MiniGPT-4", + "huggingface_link": "https://huggingface.co/spaces/Vision-CAIR/minigpt4", + "added_by": "Original Authors", + "logo": "Vision-CAIR_logo.png" + }, + { + "name": "OpenFlamingo", + "description": "Open source implementation of Flamingo", + "modalities": [ + "Text", + "Vision" + ], + "categories": [ + "Finetuning Repositories" + ], + "date": "3-2023", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2308.01390", + "website_link": "https://laion.ai/blog/open-flamingo-v2/", + "github_link": "https://github.com/mlfoundations/open_flamingo", + "huggingface_link": "https://huggingface.co/openflamingo", + "added_by": "Original Authors", + "logo": "mlfoundations_logo.png" + }, + 
{ + "name": "Otter", + "description": "Multimodal models with Flamingo architecture", + "modalities": [ + "Text", + "Vision" + ], + "categories": [ + "Finetuning Repositories" + ], + "date": "4-2023", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2311.04219", + "website_link": "", + "github_link": "https://github.com/Luodian/Otter", + "huggingface_link": "https://huggingface.co/spaces/Otter-AI/OtterHD-Demo", + "added_by": "Original Authors", + "logo": "Luodian_logo.png" + }, + { + "name": "peft", + "description": "A library for doing parameter efficient finetuning", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Finetuning Repositories" + ], + "date": "Frequently Updated", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "", + "github_link": "https://github.com/huggingface/peft", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "huggingface_logo.png" + }, + { + "name": "trl", + "description": "A library for doing RLHF on LLMs.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Finetuning Repositories" + ], + "date": "Frequently Updated", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "", + "github_link": "https://github.com/huggingface/trl", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "huggingface_logo.png" + }, + { + "name": "trlX", + "description": "A library for doing RLHF on LLMs.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Finetuning Repositories" + ], + "date": "Frequently Updated", + "primary_link": "GitHub", + "paper_link": "https://aclanthology.org/2023.emnlp-main.530/", + "website_link": "https://trlx.readthedocs.io/en/latest/", + "github_link": "https://github.com/CarperAI/trlx", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "CarperAI_logo.png" + }, + { + "name": "Levanter", + "description": "Levanter is a framework for training large 
language models (LLMs) and other foundation models that strives for legibility, scalability, and reproducibility:", + "modalities": [ + "Text" + ], + "categories": [ + "Finetuning Repositories", + "Pretraining Repositories" + ], + "date": "6-2023", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "https://crfm.stanford.edu/2023/06/16/levanter-1_0-release.html", + "github_link": "https://github.com/stanford-crfm/levanter", + "huggingface_link": "https://huggingface.co/stanford-crfm", + "added_by": "Original Authors", + "logo": "stanford-crfm_logo.png" + }, + { + "name": "AI Licensing Can\u2019t Balance \u201cOpen\u201d with \u201cResponsible\u201d", + "description": "A blog post by an IP lawyer arguing against responsible use licensing", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "License Selection" + ], + "date": "7-2023", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://katedowninglaw.com/2023/07/13/ai-licensing-cant-balance-open-with-responsible/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "AI Pubs Open RAIL-M License", + "description": "Template for a responsible AI model license where the model is intended for research use. Use restrictions relate to discrimination, transparency, and violating the law", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "License Selection" + ], + "date": "3-2023", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://www.licenses.ai/ai-pubs-open-railm-vz1", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "AI2 ImpACT-LR License", + "description": "License for low risk AI artifacts (data and models) that allows for distribution of the artifact and its derivatives. 
Use restrictions include weapons development and military surveillance", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "License Selection" + ], + "date": "7-2023", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://allenai.org/licenses/impact-lr", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "AI2 ImpACT-MR License", + "description": "License for medium risk AI artifacts (data and models) that does not allow for distribution of the artifact but does allow for distribution of its derivatives. Use restrictions include weapons development and military surveillance", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "License Selection" + ], + "date": "7-2023", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://allenai.org/licenses/impact-mr", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Apache 2.0 License", + "description": "The most common open-source license for model weights", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "License Selection" + ], + "date": "1-2004", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://www.apache.org/licenses/LICENSE-2.0", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Behavioral Use Licensing for Responsible AI", + "description": "A paper that provides a theoretical framework for licenses intended for open models with use restrictions", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "License Selection" + ], + "date": "6-2022", + "primary_link": "Paper", + "paper_link": "https://dl.acm.org/doi/10.1145/3531146.3533143", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "BigCode Open RAIL-M License", + "description": 
"Template for a responsible AI model license. Use restrictions include generation and dissemination of malware", + "modalities": [ + "Text" + ], + "categories": [ + "License Selection" + ], + "date": "5-2023", + "primary_link": "Hugging Face object", + "paper_link": "", + "website_link": "", + "github_link": "", + "huggingface_link": "https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement", + "added_by": "Original Authors" + }, + { + "name": "BigScience Open RAIL-M License", + "description": "Template for a responsible AI model license. Use restrictions include defamation, disinformation, and discrimination", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "License Selection" + ], + "date": "8-2022", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://static1.squarespace.com/static/5c2a6d5c45776e85d1482a7e/t/6308bb4bba3a2a045b72a4b0/1661516619868/BigScience+Open+RAIL-M+License.pdf", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Choose an open source license", + "description": "A guide for choosing among open source licenses that includes general selection criteria and explanations for software licenses", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "License Selection" + ], + "date": "Frequently Updated", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "https://choosealicense.com/", + "github_link": "https://github.com/github/choosealicense.com/tree/gh-pages", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "github_logo.png" + }, + { + "name": "Create Commons License Chooser", + "description": "A guide for choosing among Creative Commons licenses with an explanation of how they function", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "License Selection" + ], + "date": "Frequently Updated", + "primary_link": "Webpage", + "paper_link": "", + 
"website_link": "https://chooser-beta.creativecommons.org/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Legal Playbook For Natural Language Processing Researchers", + "description": "This playbook is a legal research resource for various activities related to data gathering, data governance, and disposition of an AI model available as a public resource.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "License Selection" + ], + "date": "7-1905", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://bigscience.huggingface.co/blog/legal-playbook-for-natural-language-processing-researchers", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Licensing is neither feasible nor effective for addressing AI risks", + "description": "Argues that licensing is not the correct way to address risks with AI systems", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "License Selection" + ], + "date": "6-2023", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://www.aisnakeoil.com/p/licensing-is-neither-feasible-nor", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Open RAIL-S License", + "description": "Template for a responsible AI source code license. 
Use restrictions relate to surveillance, synthetic media, healthcare and the criminal legal system", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "License Selection" + ], + "date": "11-2023", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://www.licenses.ai/source-code-license", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Primer on AI2 ImpACT Licenses", + "description": "A post by AI2 describing when and why an organization should use a specific ImpACT license", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "License Selection" + ], + "date": "7-2023", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://allenai.org/impact-license", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "The Open Source Definition", + "description": "The definition of an \"open source\" license", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "License Selection" + ], + "date": "2-2023", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://opensource.org/osd/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "The Turing Way, Licensing", + "description": "A guide to reproducible research and licensing", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "License Selection" + ], + "date": "Frequently Updated", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://the-turing-way.netlify.app/reproducible-research/licensing", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "What is Free Software?", + "description": "A philosophical argument for why free software licenses are important", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "License Selection" + 
], + "date": "2-2023", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://www.gnu.org/philosophy/free-sw.en.html", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Ecosystem Cards", + "description": "Ecosystem Graphs centralize information about models and their impact in the broader ecosystem. ", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Model Documentation" + ], + "date": "3-2023", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/2303.15772", + "website_link": "https://hai.stanford.edu/news/ecosystem-graphs-social-footprint-foundation-models", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Foundation Model Transparency Index", + "description": "An index to measure the transparency of a foundation model with respect to its inputs, development, and downstream uses or policies.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Model Documentation" + ], + "date": "10-2023", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/2310.12941", + "website_link": "https://crfm.stanford.edu/fmti/", + "github_link": "https://github.com/stanford-crfm/fmti", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "stanford-crfm_logo.png" + }, + { + "name": "Model Card Resources", + "description": "A release of several resources surrounding model cards, including templates and tools for easy documentation creation, and how these are frequently used in practice.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Model Documentation" + ], + "date": "12-2022", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://huggingface.co/blog/model-cards", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Model Cards", + "description": "A standard for 
reporting and documenting machine learning models, for promoting and easing transparent and open model development or reporting.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Model Documentation" + ], + "date": "10-2018", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/abs/1810.03993", + "website_link": "https://huggingface.co/spaces/huggingface/Model_Cards_Writing_Tool", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Hugging Face ML Research Release Toolkit ", + "description": "A new researcher guide to releasing model or data resources, documenting the research and Hugging Face objects.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Model Documentation" + ], + "date": "12-2023", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://docs.google.com/document/d/1EOxyZ11piIIRLDlhofX8nfnU0mHCU-TZ3EU4tx5g9aE/edit#heading=h.8zrjwmlee7ge", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "C4", + "description": "An English, cleaned version of Common Crawl's web crawl corpus (https://commoncrawl.org).", + "modalities": [ + "Text" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "4-2019", + "primary_link": "Hugging Face object", + "paper_link": "https://arxiv.org/abs/1910.10683", + "website_link": "https://commoncrawl.org", + "github_link": "https://github.com/google-research/text-to-text-transfer-transformer#c4", + "huggingface_link": "https://huggingface.co/datasets/allenai/c4", + "added_by": "Original Authors", + "logo": "google-research_logo.png" + }, + { + "name": "Common Voice", + "description": "28k hours [as of 11/2023] of crowd-sourced read speech from 100+ languages", + "modalities": [ + "Speech" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "11-2023", + "primary_link": "Webpage", + "paper_link": "", + "website_link": 
"https://commonvoice.mozilla.org/en/datasets", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "CulturaX", + "description": "A pertaining dataset of 16T tokens, covering 167 languages, cleaned, deduplicated, and refined. Combines mC4 into 2020, with OSCAR project data up to 2023.", + "modalities": [ + "Text" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "9-2023", + "primary_link": "Hugging Face object", + "paper_link": "https://arxiv.org/abs/2309.09400", + "website_link": "", + "github_link": "", + "huggingface_link": "https://huggingface.co/datasets/uonlp/CulturaX", + "added_by": "Original Authors" + }, + { + "name": "DataComp-1B and CommonPool-13B", + "description": "A large pool of 13B image-text pairs from CommonCrawl and a curated 1B subset", + "modalities": [ + "Text", + "Vision" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "4-2023", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/2304.14108", + "website_link": "https://www.datacomp.ai/", + "github_link": "https://github.com/mlfoundations/datacomp", + "huggingface_link": "https://huggingface.co/datasets/mlfoundations/datacomp_1b", + "added_by": "Original Authors", + "logo": "mlfoundations_logo.png" + }, + { + "name": "Dolma", + "description": "A pretraining dataset of 3 trillion tokens from a diverse mix of web content, academic publications, code, books, and encyclopedic materials.", + "modalities": [ + "Text" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "8-2023", + "primary_link": "Hugging Face object", + "paper_link": "https://arxiv.org/abs/2402.00159", + "website_link": "", + "github_link": "https://github.com/allenai/dolma", + "huggingface_link": "https://huggingface.co/datasets/allenai/dolma", + "added_by": "Original Authors", + "logo": "allenai_logo.png" + }, + { + "name": "GigaSpeech", + "description": "40k hours (10k transcribed) multi-domain English speech corpus", 
+ "modalities": [ + "Speech" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "7-1905", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2106.06909", + "website_link": "", + "github_link": "https://github.com/SpeechColab/GigaSpeech", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "SpeechColab_logo.png" + }, + { + "name": "Golos", + "description": "1,240 hours of crowd-sourced Russian speech", + "modalities": [ + "Speech" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "6-2021", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/2106.10161", + "website_link": "https://www.openslr.org/114/", + "github_link": "https://github.com/sberdevices/golos", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "sberdevices_logo.png" + }, + { + "name": "IndicCorp v2", + "description": "A multilingual pre-training corpus for 24 Indian languages", + "modalities": [ + "Text" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "5-2023", + "primary_link": "GitHub", + "paper_link": "https://aclanthology.org/2023.acl-long.693/", + "website_link": "", + "github_link": "https://github.com/AI4Bharat/IndicBERT/tree/main#indiccorp-v2", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "AI4Bharat_logo.png" + }, + { + "name": "IndicSUPERB", + "description": "1,684 hour crowd-sourced corpus of 12 Indian languages", + "modalities": [ + "Speech" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "8-2022", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/2208.11761", + "website_link": "https://ai4bharat.iitm.ac.in/indicsuperb/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Libri-Light", + "description": "60k hour read English speech from LibriVox audiobooks", + "modalities": [ + "Speech" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "12-2019", 
+ "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/1912.07875", + "website_link": "", + "github_link": "https://github.com/facebookresearch/libri-light", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "facebookresearch_logo.png" + }, + { + "name": "LibriSpeech", + "description": "960 hour read English speech from LibriVox audiobooks", + "modalities": [ + "Speech" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "7-1905", + "primary_link": "Webpage", + "paper_link": "http://www.danielpovey.com/files/2015_icassp_librispeech.pdf", + "website_link": "https://www.openslr.org/12/", + "github_link": "", + "huggingface_link": "https://huggingface.co/datasets/librispeech_asr", + "added_by": "Original Authors" + }, + { + "name": "MADLAD-400", + "description": "A manually audited, general domain 3T token monolingual dataset based on CommonCrawl, spanning 419 languages.", + "modalities": [ + "Text" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "9-2023", + "primary_link": "Hugging Face object", + "paper_link": "https://arxiv.org/abs/2309.04662", + "website_link": "", + "github_link": "https://github.com/google-research/google-research/tree/master/madlad_400", + "huggingface_link": "https://huggingface.co/datasets/allenai/MADLAD-400", + "added_by": "Original Authors", + "logo": "google-research_logo.png" + }, + { + "name": "mC4", + "description": "The fully multilingual, cleaned version of Common Crawl's web crawl corpus (https://commoncrawl.org).", + "modalities": [ + "Text" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "4-2019", + "primary_link": "Hugging Face object", + "paper_link": "https://arxiv.org/abs/1910.10683", + "website_link": "https://commoncrawl.org", + "github_link": "", + "huggingface_link": "https://huggingface.co/datasets/mc4", + "added_by": "Original Authors" + }, + { + "name": "MMC4", + "description": "Interleaved image-text data from Common Crawl (570M images, 
43B tokens)", + "modalities": [ + "Text", + "Vision" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "4-2023", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2304.06939", + "website_link": "", + "github_link": "https://github.com/allenai/mmc4", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "allenai_logo.png" + }, + { + "name": "OBELICS", + "description": "Interleaved image-text data from Common Crawl (353 M images, 115B tokens)", + "modalities": [ + "Text", + "Vision" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "6-2023", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2306.16527", + "website_link": "https://huggingface.co/blog/idefics", + "github_link": "https://github.com/huggingface/OBELICS", + "huggingface_link": "https://huggingface.co/datasets/HuggingFaceM4/OBELICS", + "added_by": "Original Authors", + "logo": "huggingface_logo.png" + }, + { + "name": "OLC", + "description": "The Open License Corpus is a 228B token corpus of permissively-licensed, primarily English text data for pretraining.", + "modalities": [ + "Text" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "8-2023", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2308.04430", + "website_link": "", + "github_link": "https://github.com/kernelmachine/silo-lm#download-data", + "huggingface_link": "https://huggingface.co/datasets/kernelmachine/open-license-corpus", + "added_by": "Original Authors", + "logo": "kernelmachine_logo.png" + }, + { + "name": "OpenWebMath", + "description": "A dataset containing the majority of the high-quality, mathematical text from the internet. 
It is filtered and extracted from over 200B HTML files on Common Crawl down to a set of 6.3 million documents containing a total of 14.7B tokens.", + "modalities": [ + "Text" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "10-2023", + "primary_link": "Hugging Face object", + "paper_link": "https://arxiv.org/abs/2310.06786", + "website_link": "", + "github_link": "https://github.com/keirp/OpenWebMath", + "huggingface_link": "https://huggingface.co/datasets/open-web-math/open-web-math", + "added_by": "Original Authors", + "logo": "keirp_logo.png" + }, + { + "name": "OPUS", + "description": "The Open Parallel Corpus is a massive collection of translated text pairs from the web.", + "modalities": [ + "Text" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "Frequently Updated", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://opus.nlpl.eu/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "OSCAR", + "description": "The Open Super-large Crawled Aggregated coRpus provides web-based multilingual datasets across 166 languages.", + "modalities": [ + "Text" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "Frequently Updated", + "primary_link": "Webpage", + "paper_link": "https://aclanthology.org/2022.wnut-1.23/", + "website_link": "https://oscar-project.org/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "peS2o", + "description": "A collection of ~40M creative open-access academic papers, cleaned, filtered, and formatted for pre-training of language models, originally derived from the Semantic Scholar Open Research Corpus (S2ORC).", + "modalities": [ + "Text" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "1-2023", + "primary_link": "Hugging Face object", + "paper_link": "https://arxiv.org/abs/1911.02782", + "website_link": "", + "github_link": "", + "huggingface_link": 
"https://huggingface.co/datasets/allenai/peS2o", + "added_by": "Original Authors" + }, + { + "name": "Pile of Law", + "description": "An open-source, English dataset with \u223c256GB of legal and administrative data, covering court opinions, contracts, administrative rules, and legislative records.", + "modalities": [ + "Text" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "11-2022", + "primary_link": "Hugging Face object", + "paper_link": "https://arxiv.org/abs/2207.00220", + "website_link": "", + "github_link": "", + "huggingface_link": "https://huggingface.co/datasets/pile-of-law/pile-of-law", + "added_by": "Original Authors" + }, + { + "name": "RedPajama v2", + "description": "A pretraining dataset of 30 trillion filtered and deduplicated tokens (100+ trillions raw) from 84 CommonCrawl dumps covering 5 languages, along with 40+ pre-computed data quality annotations that can be used for further filtering and weighting.", + "modalities": [ + "Text" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "10-2023", + "primary_link": "Hugging Face object", + "paper_link": "", + "website_link": "https://www.together.ai/blog/redpajama-data-v2", + "github_link": "https://github.com/togethercomputer/RedPajama-Data", + "huggingface_link": "https://huggingface.co/datasets/togethercomputer/RedPajama-Data-V2", + "added_by": "Original Authors", + "logo": "togethercomputer_logo.png" + }, + { + "name": "ROOTS", + "description": "A massive multilingual pretraining corpus from BigScience, comprised of 1.6TB of text spanning 59 languages. 
It is a mix of OSCAR (https://oscar-project.org/) and the datasets found in the BigScience Catalogue (https://huggingface.co/spaces/bigscience/SourcingCatalog).", + "modalities": [ + "Text" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "5-2022", + "primary_link": "Hugging Face object", + "paper_link": "https://arxiv.org/abs/2303.03915", + "website_link": "https://bigscience.huggingface.co/", + "github_link": "https://github.com/bigscience-workshop/bigscience/tree/master/data", + "huggingface_link": "https://huggingface.co/bigscience-data", + "added_by": "Original Authors", + "logo": "bigscience-workshop_logo.png" + }, + { + "name": "Samr\u00f3mur", + "description": "2,200 hour crowd-sourced corpus of Icelandic speech", + "modalities": [ + "Speech" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "7-1905", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://www.openslr.org/128/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Shrutilipi", + "description": "6,400 hour corpus of TV/Radio broadcasts from 12 Indian languages", + "modalities": [ + "Speech" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "8-2022", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/2208.12666", + "website_link": "https://ai4bharat.iitm.ac.in/shrutilipi/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "The People\u2019s Speech", + "description": "30k hour conversational English dataset", + "modalities": [ + "Speech" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "11-2021", + "primary_link": "Hugging Face object", + "paper_link": "https://arxiv.org/abs/2111.09344", + "website_link": "", + "github_link": "", + "huggingface_link": "https://huggingface.co/datasets/MLCommons/peoples_speech", + "added_by": "Original Authors" + }, + { + "name": "The Pile", + "description": "An 
825GB English pretraining corpus that mixes portions of common crawl with 22 smaller, high-quality datasets combined together.", + "modalities": [ + "Text" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "12-2020", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/2101.00027", + "website_link": "https://pile.eleuther.ai/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "The Proof Pile 2", + "description": "The Proof-Pile-2 is a 55 billion token dataset of mathematical and scientific documents.", + "modalities": [ + "Text" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "9-2023", + "primary_link": "Hugging Face object", + "paper_link": "https://arxiv.org/abs/2310.10631", + "website_link": "https://blog.eleuther.ai/llemma/", + "github_link": "https://github.com/EleutherAI/math-lm", + "huggingface_link": "https://huggingface.co/datasets/EleutherAI/proof-pile-2", + "added_by": "Original Authors", + "logo": "EleutherAI_logo.png" + }, + { + "name": "The RefinedWeb", + "description": "An English-only, web-only, deduplicated pretraining dataset of five trillion tokens.", + "modalities": [ + "Text" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "6-2023", + "primary_link": "Hugging Face object", + "paper_link": "https://arxiv.org/abs/2306.01116", + "website_link": "", + "github_link": "", + "huggingface_link": "https://huggingface.co/datasets/tiiuae/falcon-refinedweb", + "added_by": "Original Authors" + }, + { + "name": "The Stack", + "description": "The Stack is a 6TB, permissively-licensed pretraining dataset from active GitHub repositories covering 358 programming languages.", + "modalities": [ + "Text" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "11-2022", + "primary_link": "Hugging Face object", + "paper_link": "https://arxiv.org/abs/2211.15533", + "website_link": 
"https://www.bigcode-project.org/docs/about/the-stack/#datasets-and-data-governance-tools-released-by-bigcode", + "github_link": "https://github.com/bigcode-project/bigcode-dataset", + "huggingface_link": "https://huggingface.co/datasets/bigcode/the-stack", + "added_by": "Original Authors", + "logo": "bigcode-project_logo.png" + }, + { + "name": "VoxPopuli", + "description": "400k hours of unlabelled speech from 23 languages of the European parliament", + "modalities": [ + "Speech" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "1-2021", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2101.00390", + "website_link": "", + "github_link": "https://github.com/facebookresearch/voxpopuli", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "facebookresearch_logo.png" + }, + { + "name": "WebVid-10M", + "description": "10M videos with captions", + "modalities": [ + "Text", + "Vision" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "4-2021", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/2104.00650", + "website_link": "https://maxbain.com/webvid-dataset/", + "github_link": "https://github.com/m-bain/webvid", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "m-bain_logo.png" + }, + { + "name": "WenetSpeech", + "description": "22.4k hour multi-domain corpus of Mandarin", + "modalities": [ + "Speech" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "10-2021", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/2110.03370", + "website_link": "https://www.openslr.org/121/", + "github_link": "https://github.com/wenet-e2e/WenetSpeech", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "wenet-e2e_logo.png" + }, + { + "name": "WURA", + "description": "A manually audited multilingual pre-training corpus (document-level dataset) for 16 African languages and four high-resource languages widely spoken in Africa 
(English, French, Arabic and Portuguese)", + "modalities": [ + "Text" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "11-2023", + "primary_link": "Hugging Face object", + "paper_link": "https://aclanthology.org/2023.emnlp-main.11/", + "website_link": "", + "github_link": "", + "huggingface_link": "https://huggingface.co/datasets/castorini/wura", + "added_by": "Original Authors" + }, + { + "name": "WebDatasets", + "description": "A dataset format for high-performance streaming of data. Especially useful for modalities other than language that are more I/O intensive for training, such as images, video, or audio.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "Frequently Updated", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "", + "github_link": "https://github.com/webdataset/webdataset", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "webdataset_logo.png" + }, + { + "name": "Multi Legal Pile", + "description": "A large-scale multilingual legal dataset and superset of the Pile of Law, suited for pretraining language models. It spans over 24 languages and five legal text types.", + "modalities": [ + "Text" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "6-2023", + "primary_link": "Hugging Face object", + "paper_link": "https://arxiv.org/abs/2306.02069", + "website_link": "", + "github_link": "", + "huggingface_link": "https://huggingface.co/datasets/joelniklaus/Multi_Legal_Pile", + "added_by": "Original Authors" + }, + { + "name": "GPT-NeoX", + "description": "A library for training large language models, built off Megatron-DeepSpeed and Megatron-LM with an easier user interface. 
Used at massive scale on a variety of clusters and hardware setups.", + "modalities": [ + "Text" + ], + "categories": [ + "Pretraining Repositories" + ], + "date": "Frequently Updated", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "", + "github_link": "https://github.com/EleutherAI/gpt-neox", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "EleutherAI_logo.png" + }, + { + "name": "Kosmos-2", + "description": "For training multimodal models with CLIP backbones.", + "modalities": [ + "Text", + "Vision" + ], + "categories": [ + "Pretraining Repositories" + ], + "date": "6-2023", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2306.14824", + "website_link": "", + "github_link": "https://github.com/microsoft/unilm/tree/master/kosmos-2", + "huggingface_link": "https://huggingface.co/spaces/ydshieh/Kosmos-2", + "added_by": "Original Authors", + "logo": "microsoft_logo.png" + }, + { + "name": "Lhotse", + "description": "Python library for handling speech data in machine learning projects", + "modalities": [ + "Speech" + ], + "categories": [ + "Pretraining Repositories" + ], + "date": "10-2023", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "https://github.com/lhotse-speech/lhotse", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Megatron-DeepSpeed", + "description": "A library for training large language models, built off of Megatron-LM but extended by Microsoft to support features of their DeepSpeed library.", + "modalities": [ + "Text" + ], + "categories": [ + "Pretraining Repositories" + ], + "date": "Frequently Updated", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "", + "github_link": "https://github.com/microsoft/Megatron-DeepSpeed", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "microsoft_logo.png" + }, + { + "name": "Megatron-LM", + "description": "One of the earliest open-source 
pretraining codebases for large language models. Still updated and has been used for a number of landmark distributed training and parallelism research papers by NVIDIA.", + "modalities": [ + "Text" + ], + "categories": [ + "Pretraining Repositories" + ], + "date": "", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "", + "github_link": "https://github.com/NVIDIA/Megatron-LM", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "NVIDIA_logo.png" + }, + { + "name": "OpenCLIP", + "description": "Supports training and inference for over 100 CLIP models", + "modalities": [ + "Text", + "Vision" + ], + "categories": [ + "Pretraining Repositories" + ], + "date": "9-2021", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "", + "github_link": "https://github.com/mlfoundations/open_clip", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "mlfoundations_logo.png" + }, + { + "name": "OpenLM", + "description": "OpenLM is a minimal language modeling repository, aimed to facilitate research on medium sized LMs. They have verified the performance of OpenLM up to 7B parameters and 256 GPUs. 
They depend only on PyTorch, XFormers, or Triton.", + "modalities": [ + "Text" + ], + "categories": [ + "Pretraining Repositories" + ], + "date": "Frequently Updated", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "", + "github_link": "https://github.com/mlfoundations/open_lm", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "mlfoundations_logo.png" + }, + { + "name": "Pytorch Image Models (timm)", + "description": "Hub for models, scripts and pre-trained weights for image classification models.", + "modalities": [ + "Vision" + ], + "categories": [ + "Pretraining Repositories" + ], + "date": "5-2019", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "", + "github_link": "https://github.com/huggingface/pytorch-image-models", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "huggingface_logo.png" + }, + { + "name": "Stable Audio Tools", + "description": "A codebase for distributed training of generative audio models.", + "modalities": [ + "Speech" + ], + "categories": [ + "Pretraining Repositories" + ], + "date": "Frequently Updated", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "", + "github_link": "https://github.com/Stability-AI/stable-audio-tools", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "Stability-AI_logo.png" + }, + { + "name": "Bias Benchmark for QA (BBQ)", + "description": "A dataset of question-sets constructed by the authors that highlight attested social biases against people belonging to protected classes along nine different social dimensions relevant for U.S. 
English-speaking contexts.", + "modalities": [ + "Text" + ], + "categories": [ + "Risks & Harms Evaluation" + ], + "date": "10-2021", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2110.08193", + "website_link": "", + "github_link": "https://github.com/nyu-mll/BBQ", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "nyu-mll_logo.png" + }, + { + "name": "Crossmodal-3600", + "description": "Image captioning evaluation with geographically diverse images in 36 languages", + "modalities": [ + "Text", + "Vision" + ], + "categories": [ + "Risks & Harms Evaluation" + ], + "date": "5-2022", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2205.12522", + "website_link": "", + "github_link": "https://google.github.io/crossmodal-3600/", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "FactualityPrompt", + "description": "A benchmark to measure factuality in language models.", + "modalities": [ + "Text" + ], + "categories": [ + "Risks & Harms Evaluation" + ], + "date": "6-2022", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2206.04624", + "website_link": "", + "github_link": "https://github.com/nayeon7lee/FactualityPrompt", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "nayeon7lee_logo.png" + }, + { + "name": "From text to talk", + "description": "Harnessing conversational corpora for humane and diversity-aware language technology. 
They show how interactional data from 63 languages (26 families) harbours insights about turn-taking, timing, sequential structure and social action.", + "modalities": [ + "Speech" + ], + "categories": [ + "Risks & Harms Evaluation" + ], + "date": "5-2022", + "primary_link": "Paper", + "paper_link": "https://aclanthology.org/2022.acl-long.385/ ", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Hallucinations", + "description": "Public LLM leaderboard computed using Vectara's Hallucination Evaluation Model. This evaluates how often an LLM introduces hallucinations when summarizing a document. ", + "modalities": [ + "Text" + ], + "categories": [ + "Risks & Harms Evaluation" + ], + "date": "10-2023", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "https://github.com/vectara/hallucination-leaderboard", + "github_link": "", + "huggingface_link": "https://huggingface.co/vectara/hallucination_evaluation_model", + "added_by": "Original Authors" + }, + { + "name": "HolisticBias", + "description": "A bias and toxicity benchmark using templated sentences, covering nearly 600 descriptor terms across 13 different demographic axes, for a total of 450k examples", + "modalities": [ + "Text" + ], + "categories": [ + "Risks & Harms Evaluation" + ], + "date": "10-2022", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2205.09209", + "website_link": "https://ai.meta.com/research/publications/im-sorry-to-hear-that-finding-new-biases-in-language-models-with-a-holistic-descriptor-dataset/", + "github_link": "https://github.com/facebookresearch/ResponsibleNLP/tree/main/holistic_bias", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "facebookresearch_logo.png" + }, + { + "name": "Purple Llama CyberSecEval", + "description": "A benchmark for coding assistants, measuring their propensity to generate insecure code and level of compliance when asked to assist in 
cyberattacks.", + "modalities": [ + "Text" + ], + "categories": [ + "Risks & Harms Evaluation" + ], + "date": "12-2023", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://ai.meta.com/research/publications/purple-llama-cyberseceval-a-benchmark-for-evaluating-the-cybersecurity-risks-of-large-language-models/", + "github_link": "https://github.com/facebookresearch/PurpleLlama/tree/main/CybersecurityBenchmarks", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "facebookresearch_logo.png" + }, + { + "name": "Purple Llama Guard", + "description": "A tool to identify and protect against malicious inputs to LLMs.", + "modalities": [ + "Text" + ], + "categories": [ + "Risks & Harms Evaluation" + ], + "date": "12-2023", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/2312.06674", + "website_link": "https://ai.meta.com/research/publications/llama-guard-llm-based-input-output-safeguard-for-human-ai-conversations/", + "github_link": "https://github.com/facebookresearch/PurpleLlama/tree/main/Llama-Guard", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "facebookresearch_logo.png" + }, + { + "name": "Racial disparities in automated speech recognition", + "description": "A discussion of racial disparities and inclusiveness in automated speech recognition.", + "modalities": [ + "Speech" + ], + "categories": [ + "Risks & Harms Evaluation" + ], + "date": "3-2020", + "primary_link": "Paper", + "paper_link": "", + "website_link": "https://www.pnas.org/doi/10.1073/pnas.1915768117", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "RealToxicityPrompts", + "description": "A dataset of 100k sentence snippets from the web for researchers to further address the risk of neural toxic degeneration in models.", + "modalities": [ + "Text" + ], + "categories": [ + "Risks & Harms Evaluation" + ], + "date": "9-2020", + "primary_link": "GitHub", + "paper_link": 
"https://arxiv.org/abs/2009.11462", + "website_link": "https://toxicdegeneration.allenai.org/", + "github_link": "https://github.com/allenai/real-toxicity-prompts", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "allenai_logo.png" + }, + { + "name": "Red Teaming LMs with LMs", + "description": "A method for using one language model to automatically find cases where a target LM behaves in a harmful way, by generating test cases (\"red teaming\")", + "modalities": [ + "Text" + ], + "categories": [ + "Risks & Harms Evaluation" + ], + "date": "2-2022", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/abs/2202.03286", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Safety evaluation repository", + "description": "A repository of safety evaluations, across all modalities and harms, as of late 2023. Useful for delving deeper if the following evaluations don't meet your needs.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Risks & Harms Evaluation" + ], + "date": "10-2023", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://dpmd.ai/46CPd58", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "SimpleSafetyTests", + "description": "Small probe set (100 English text prompts) covering severe harms: child abuse, suicide, self-harm and eating disorders, scams and fraud, illegal items, and physical harm", + "modalities": [ + "Text" + ], + "categories": [ + "Risks & Harms Evaluation" + ], + "date": "11-2023", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2311.08370", + "website_link": "", + "github_link": "https://github.com/bertiev/SimpleSafetyTests", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "bertiev_logo.png" + }, + { + "name": "SneakyPrompt", + "description": "Automated jailbreaking method to generate NSFW content 
even with models that have filters applied", + "modalities": [ + "Vision" + ], + "categories": [ + "Risks & Harms Evaluation" + ], + "date": "5-2023", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2305.12082", + "website_link": "", + "github_link": "https://github.com/Yuchen413/text2image_safety", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "Yuchen413_logo.png" + }, + { + "name": "StableBias", + "description": "Bias testing benchmark for Image to Text models, based on gender-occupation associations.", + "modalities": [ + "Vision" + ], + "categories": [ + "Risks & Harms Evaluation" + ], + "date": "3-2023", + "primary_link": "Hugging Face object", + "paper_link": "", + "website_link": "https://arxiv.org/abs/2303.11408", + "github_link": "", + "huggingface_link": "https://huggingface.co/spaces/society-ethics/StableBias", + "added_by": "Original Authors" + }, + { + "name": "Cerebras Model Lab", + "description": "A calculator to apply compute-optimal scaling laws for a given budget, including factoring expected total inference usage.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Efficiency & Resource Allocation" + ], + "date": "5-2023", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://www.cerebras.net/model-lab/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "QLoRa", + "description": "An efficient finetuning approach that reduces memory usage while training.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Efficiency & Resource Allocation" + ], + "date": "5-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/abs/2305.14314", + "website_link": "", + "github_link": "https://github.com/artidoro/qlora", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "artidoro_logo.png" + }, + { + "name": "Scaling Data-Constrained Language Models", + "description": 
"Demonstrates an optimal allocation of compute when dataset size is bounded", + "modalities": [ + "Text" + ], + "categories": [ + "Efficiency & Resource Allocation" + ], + "date": "5-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/abs/2305.16264", + "website_link": "", + "github_link": "https://github.com/huggingface/datablations", + "huggingface_link": "https://huggingface.co/datablations", + "added_by": "Original Authors", + "logo": "huggingface_logo.png" + }, + { + "name": "Training Compute-Optimal Language Models", + "description": "Proposes an optimal allocation of computational budget between model and dataset size, and shows experimental design for fitting scaling laws for compute allocation in a new setting.", + "modalities": [ + "Text" + ], + "categories": [ + "Efficiency & Resource Allocation" + ], + "date": "3-2022", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/abs/2203.15556", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "AI Incident Database", + "description": "A database of harmful incidents tied to AI systems where developers or users can submit incident reports", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Usage Monitoring" + ], + "date": "Frequently Updated", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://incidentdatabase.ai/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "BigScience Ethical Charter", + "description": "Outlines BigScience's core values and how they promote them, which in turn guides use restrictions and communicates acceptable usage to users", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Usage Monitoring" + ], + "date": "6-2022", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://bigscience.huggingface.co/blog/bigscience-ethical-charter", + 
"github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Llama 2 Responsible Use Guide", + "description": "Guidance for downstream developers on how to responsibly build with Llama 2. Includes details on how to report issues and instructions related to red-teaming and RLHF", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Usage Monitoring" + ], + "date": "12-2023", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://ai.meta.com/llama/responsible-use-guide/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Model Gating from Hugging Face", + "description": "A resource describing how to require user credentials for model access, which may be appropriate for models trained for topics such as hate speech", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Usage Monitoring" + ], + "date": "Frequently Updated", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://huggingface.co/docs/hub/models-gated", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Model Monitoring in Practice Tutorial", + "description": "A tutorial given at FAccT and other venues describing how and why to monitor ML models. 
Includes a presentation on using transformer models to monitor for error detection", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Usage Monitoring" + ], + "date": "6-2022", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://sites.google.com/view/model-monitoring-tutorial", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Robust Invisible Video Watermarking with Attention", + "description": "A widely used watermark for video models ", + "modalities": [ + "Vision" + ], + "categories": [ + "Usage Monitoring" + ], + "date": "9-2019", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/abs/1909.01285", + "website_link": "", + "github_link": "https://github.com/DAI-Lab/RivaGAN", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "DAI-Lab_logo.png" + }, + { + "name": "Robust Distortion-free Watermarks for Language Models", + "description": "A watermark for autoregressive language models", + "modalities": [ + "Text" + ], + "categories": [ + "Usage Monitoring" + ], + "date": "7-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/abs/2307.15593", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "A Holistic Approach to Undesired Content Detection in the Real World", + "description": "Description of five primary categories (Sexual, Hateful, Violent, Self-harm, Harassment) with sub-categories (e.g. Sexual / sexual content involving minors). 
Also describes a moderation filter (the OpenAI moderation endpoint), and releases a dataset labelled for the categories.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Risks & Harms Taxonomies" + ], + "date": "2-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/pdf/2208.03274.pdf", + "website_link": "", + "github_link": "https://github.com/openai/moderation-api-release", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "openai_logo.png" + }, + { + "name": "Perspective API", + "description": "Perspective API for content moderation. It has three classes of categories, each with 6+ attributes. (1) Production (Toxicity, Severe Toxicity, Identity Attack, Insult, Profanity, and Threats), (2) Experimental (Toxicity, Severe Toxicity, Identity Attack, Insult, Profanity, Threat, Sexually Explicit, and Flirtation), (3) NY Times (Attack on author, Attack on commenter, Incoherent, Inflammatory, Likely to Reject, Obscene, Spam, Unsubstantial).", + "modalities": [ + "Text" + ], + "categories": [ + "Risks & Harms Evaluation" + ], + "date": "8-2022", + "primary_link": "Paper", + "paper_link": "https://dl.acm.org/doi/pdf/10.1145/3534678.3539147", + "website_link": "https://developers.perspectiveapi.com/s/about-the-api-attributes-and-languages?language=en_US", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Mistral in-context self-reflection safety prompt", + "description": "Self-reflection prompt for use as a content moderation filter. 
It returns a binary value (safe/not) with 13 subcategories: Illegal, Child abuse, Hate Violence Harassment, Malware, Physical Harm, Economic Harm, Fraud, Adult, Political campaigning or lobbying, Privacy invasion, Unqualified law advice, Unqualified financial advice, Unqualified health advice", + "modalities": [ + "Text" + ], + "categories": [ + "Risks & Harms Evaluation" + ], + "date": "10-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/pdf/2310.06825.pdf", + "website_link": "https://www.promptingguide.ai/models/mistral-7b", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Google, Gemini API Safety Filters (via Vertex)", + "description": "Safety filter for Gemini models, available through Vertex. 4 safety attributes are described: Hate speech, Harassment, Sexually Explicit, and Dangerous Content. Probabilities are returned for each attribute (Negligible, Low, Medium, High). ", + "modalities": [ + "Text" + ], + "categories": [ + "Risks & Harms Evaluation" + ], + "date": "12-2023", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://cloud.google.com/vertex-ai/docs/generative-ai/multimodal/configure-safety-attributes", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Google, PaLM API Safety Filters (via Vertex)", + "description": "Safety filter for PaLM models, available through Vertex. 
16 safety attributes are described (some of which are 'topical' rather than purely safety risks): Derogatory, Toxic, Violent, Sexual, Insult, Profanity, Death Harm & Tragedy, Firearms & Weapons, Public safety, Health, Religion & belief, Illicit drugs, War & conflict, Politics, Finance, Legal.", + "modalities": [ + "Text" + ], + "categories": [ + "Risks & Harms Evaluation" + ], + "date": "3-2023", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://cloud.google.com/vertex-ai/docs/generative-ai/configure-safety-attributes-palm", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "ActiveFence's LLM Safety Review: Benchmarks and Analysis", + "description": "Description of 4 risk categories, as part of a benchmark review of LLM safety: (1) Hate, (2) Misinformation, (3) Self-harm & Suicide, (4) Child abuse & exploitation.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Risks & Harms Taxonomies" + ], + "date": "07-2023", + "primary_link": "Paper", + "paper_link": "https://www.activefence.com/LLMSafety", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Anthropic content moderation prompt", + "description": "In-context prompt for assessing whether messages and responses contain inappropriate content: \"violent, illegal or pornographic activities\"", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Risks & Harms Evaluation" + ], + "date": "12-2023", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://docs.anthropic.com/claude/docs/content-moderation", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Red Teaming Language Models to Reduce Harms: Methods, Scaling Behaviors, and Lessons Learned", + "description": "Description of 20 risk areas, as part of red teaming Anthropics' models. 
Two of the tags are not interpretable (\"Other\" and \"N/A - Invalid attempt\"): Discrimination & justice, Hate speech & offensive language, Violence & incitement, Non-violent unethical behaviour (e.g. lying, cheating), Bullying & harassment, Other, Theft, N/A - Invalid attempt, Soliciting personally identifiable information, Conspiracy theories & misinformation, Substance abuse & banned substances, Fraud & deception, Weapons, Adult content, Property crime & vandalism, Animal abuse, Terrorism & organized crime, Sexual exploitation & human trafficking, Self-harm, Child abuse.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Risks & Harms Taxonomies" + ], + "date": "10-2022", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/pdf/2209.07858.pdf", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "BEAVERTAILS: Towards Improved Safety Alignment of LLM via a Human-Preference Dataset", + "description": "Description of 14 risk areas, as part of a QA dataset for aligning models and evaluating their safety: Hate Speech, Offensive Language, Discrimination, Stereotype, Injustice, Violence, Aiding and Abetting, Incitement, Financial Crime, Property Crime, Theft, Privacy Violation, Drug Abuse, Weapons, Banned Substance, Non-Violent Unethical Behavior, Sexually Explicit, Adult Content, Controversial Topics, Politics, Misinformation Re. 
ethics, laws and safety, Terrorism, Organized Crime, Self-Harm, Animal Abuse, Child Abuse", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Risks & Harms Taxonomies" + ], + "date": "10-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/pdf/2307.04657.pdf", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Safety Assessment of Chinese Large Language Models", + "description": "Description of 8 risk areas (called \"safety scenarios)\": Insult, Unfairness and Discrimination, Crimes and Illegal Activities, Sensitive Topics, Physical Harm, Mental health, Privacy and Property, Ethics and Morality. Six \"instruction attacks\" are also described: Goal hijacking, Prompt leaking, RolePlay Instruction, Unsafe Instruction Topic, Inquiry with Unsafe Opinion, Reverse Exposure.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Risks & Harms Taxonomies" + ], + "date": "4-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/pdf/2304.10436.pdf", + "website_link": "", + "github_link": "https://github.com/thu-coai/Safety-Prompts", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "thu-coai_logo.png" + }, + { + "name": "DECODINGTRUST: A Comprehensive Assessment of Trustworthiness in GPT Models", + "description": "Description of 8 evaluation areas: toxicity, stereotypes bias, adversarial robustness, out-of-distribution robustness, robustness against adversarial demonstrations, privacy, machine ethics, fairness.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Risks & Harms Taxonomies" + ], + "date": "1-2024", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/pdf/2306.11698.pdf", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "A Unified Typology of Harmful Content", + "description": "Taxonomy 
of harmful online content. There are 4 primary categories, which each have subcategories: (1) Hate and harassment (Doxxing, Identity attack, Identity misrepresentation, Insult, Sexual aggression, Threat of violence), (2) Self-inflicted harm (Eating disorder promotion, self-harm), (3) Ideological harm (Extremism, Terrorism & Organized crime, Misinformation), (4) Exploitation (Adult sexual services, Child sexual abuse material, Scams).", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Risks & Harms Taxonomies" + ], + "date": "11-2020", + "primary_link": "Paper", + "paper_link": "https://aclanthology.org/2020.alw-1.16.pdf", + "website_link": "https://docs.cohere.com/docs/content-moderation-with-classify", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Towards Safer Generative Language Models: A Survey on Safety Risks, Evaluations, and Improvements", + "description": "Description of 7 risk areas, as part of a survey on LLM risks: Toxicity and Abusive Content, Unfairness and Discrimination, Ethics and Morality Issues, Controversial Opinions, Misleading Information, Privacy and Data Leakage, Malicious Use and Unleashing AI Agents.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Risks & Harms Taxonomies" + ], + "date": "11-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/pdf/2302.09270.pdf", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": 
"Llama 2: Open Foundation and Fine-Tuned Chat Models", + "description": "Description of 3 risk areas, as part of the safety checks for releasing Llama2: (1) illicit and criminal activities (terrorism, theft, human trafficking), (2) hateful and harmful activities (defamation, self-harm, eating disorders, discrimination), and (3) unqualified advice (medical, financial and legal advice). Other risk categories are described as part of red teaming and soliciting feedback.", + "modalities": [ + "Text" + ], + "categories": [ + "Risks & Harms Taxonomies" + ], + "date": "7-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/pdf/2307.09288.pdf", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Ethical and social risks of harm from Language Models", + "description": "Two-tier taxonomy of risks, comprising both classification groups (of which there are 6) and associated harms (3 or 4 for each classification group). The classification groups are: (1) Discrimination, Exclusion and Toxicity, (2) Information Hazards, (3) Misinformation Harms, (4) Malicious Uses, (5) Human-Computer Interaction Harms, and (6) Automation, access, and environmental harms.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Risks & Harms Taxonomies" + ], + "date": "12-2021", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/pdf/2112.04359.pdf", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Sociotechnical Safety Evaluation of Generative AI Systems", + "description": "Two-tier taxonomy of risks, comprising both classification groups (of which there are 6) and associated harms (3 or 4 for each classification group). 
The classification groups are: (1) Representation and Toxicity Harms, (2) Misinformation Harms, (3) Information & Society Harms, (4) Malicious Use, (5) Human Autonomy & Integrity Harms, and (6) Socioeconomic & Environmental Harms.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Risks & Harms Taxonomies" + ], + "date": "10-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/pdf/2310.11986.pdf", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Trustworthy LLMs: a Survey and Guideline for Evaluating Large Language Models' Alignment", + "description": "Two-tier taxonomy of risks, with seven major categories of LLM trustworthiness, each of which has several associated sub-categories: (1) Reliability, (2) Safety, (3) Fairness, (4) Resistance to Misuse, (5) Explainability and Reasoning, (6) Social Norms, and (7) Robustness.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Risks & Harms Taxonomies" + ], + "date": "8-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/pdf/2308.05374.pdf", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Process for Adapting Language Models to Society (PALMS) with Values-Targeted Datasets", + "description": "Description of 8 risk areas, as part of describing methods for aligning models: (1) Abuse, Violence and Threat (inclusive of self-harm), (2) Health (physical and mental), (3) Human characteristics and behaviour, (4) Injustice and inequality (incl. discrimination, harmful stereotypes), (5) Political opinion and destabilization, (6) Relationships (romantic, familial, friendships), (7) Sexual activity (inclusive of pornography), (8) Terrorism (inclusive of white supremacy).", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Risks & Harms Taxonomies" + ], + "date": "6-2021", + 
"primary_link": "Paper", + "paper_link": "https://arxiv.org/pdf/2106.10328.pdf", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Sociotechnical Harms of Algorithmic Systems: Scoping a Taxonomy for Harm Reduction", + "description": "Description of 5 categories of harm, with detailed subcategories: (1) Representational harms, (2) Allocative harms, (3) Quality of Service harms, (4) Interpersonal harms, and (5) Social system harms. ", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Risks & Harms Taxonomies" + ], + "date": "7-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/pdf/2210.05791.pdf", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Deepfakes, Phrenology, Surveillance, and More! A Taxonomy of AI Privacy Risks", + "description": "Taxonomy of 12 privacy risks, based on reviewing 321 privacy-related incidents, filtered from the AI, Algorithmic and Automation Incident and Controversy Repository (AIAAIC) Database. 
Risks are split into those that are created by AI (Identification, Distortion, Exposure, Aggregation, Phrenology/Physiognomy) and those that are exacerbated by AI (Intrusion, Surveillance, Exclusion, Secondary Use, Insecurity, Increased Accessibility).", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Risks & Harms Taxonomies" + ], + "date": "10-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/pdf/2310.07879.pdf", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "The Ethical Implications of Generative Audio Models: A Systematic Literature Review", + "description": "Taxonomy of 12 \"negative broader impacts\" from generative models involving speech and music.", + "modalities": [ + "Speech" + ], + "categories": [ + "Risks & Harms Taxonomies" + ], + "date": "7-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/pdf/2307.05527.pdf", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "An Overview of Catastrophic AI Risks", + "description": "Taxonomy of 4 catastrophic AI risks, with subcategories: (1) Malicious use (Bioterrorism, Uncontrolled AI agents, AI capabilities for propaganda, Censorship and surveillance), (2) AI race (Autonomous weapons, Cyberwarfare, Automated human labour [mass unemployment and dependence on AI systems]), (3) Organizational risks (AI accidentally leaked/stolen), (4) Rogue AIs (Proxy gaming, Goal drift, Power-seeking, Deception).", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Risks & Harms Taxonomies" + ], + "date": "9-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/pdf/2306.12001.pdf", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "The Malicious Use of Artificial Intelligence: Forecasting, Prevention, and 
Mitigation", + "description": "Taxonomy of 3 AI security risks, with subcategories: (1) Digital Security, (2) Physical Security, (3) Political Security.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Risks & Harms Taxonomies" + ], + "date": "2-2018", + "primary_link": "Paper", + "paper_link": "https://img1.wsimg.com/blobby/go/3d82daa4-97fe-4096-9c6b-376b92c619de/downloads/MaliciousUseofAI.pdf?ver=1553030594217", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Open-sourcing highly capable foundation models", + "description": "Description of risks from malicious use of AI: Influence operations, Surveillance and population control, Scamming and spear phishing, Cyber attacks, Biological and chemical weapons development. Some \"extreme risks\" are also described in the paper (e.g. disruption to key societal functions).", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Risks & Harms Taxonomies" + ], + "date": "9-2023", + "primary_link": "Paper", + "paper_link": "https://cdn.governance.ai/Open-Sourcing_Highly_Capable_Foundation_Models_2023_GovAI.pdf", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "How Does Access Impact Risk? Assessing AI Foundation Model Risk Along a Gradient of Access ", + "description": "Description of risks from open-sourcing models, including five instances of malicious use: (1) Fraud and other crime schemes, (2) Undermining of social cohesion and democratic processes, (3) Human rights abuses, (4) Disruption of critical infrastructure, and (5) State conflict. 
", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Risks & Harms Taxonomies" + ], + "date": "12-2023", + "primary_link": "Paper", + "paper_link": "https://securityandtechnology.org/wp-content/uploads/2023/12/How-Does-Access-Impact-Risk-Assessing-AI-Foundation-Model-Risk-Along-A-Gradient-of-Access-Dec-2023.pdf", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "OpenAI Preparedness Framework (Beta)", + "description": "Description of 4 catastrophic AI risks: (1) Cybersecurity, (2) Chemical, Biological, Nuclear and Radiological (CBRN) threats, (3) Persuasion, and (4) Model autonomy. The paper also highlights the risk of \"unknown unknowns\".", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Risks & Harms Taxonomies" + ], + "date": "12-2023", + "primary_link": "Paper", + "paper_link": "https://cdn.openai.com/openai-preparedness-framework-beta.pdf", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Anthropic's Responsible Scaling Policy", + "description": "Framework with four tiers of model capability, from ASL-1 (smaller models) to ASL-4 (speculative), with increasing risk as models' capability increases. 
It also describes 4 catastrophic AI risks: (1) Misuse risks, (2) CBRN risks, (3) Cyber risks, and (4) Autonomy and replication risks.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Risks & Harms Taxonomies" + ], + "date": "9-2023", + "primary_link": "Paper", + "paper_link": "https://www-files.anthropic.com/production/files/responsible-scaling-policy-1.0.pdf", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Model evaluation for extreme risks", + "description": "Framework of 9 dangerous capabilities of AI models: (1) Cyber-offense, (2) Deception, (3) Persuasion & manipulation, (4) Political strategy, (5) Weapons acquisition, (6) Long-horizon planning, (7) AI development, (8) Situational awareness, (9) Self-proliferation.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Risks & Harms Taxonomies" + ], + "date": "9-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/pdf/2305.15324.pdf", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Frontier AI Regulation: Managing Emerging Risks to Public Safety", + "description": "Description of \"sufficiently dangerous capabilities\" of AI models to cause serious harm and disruption on a global scale, such as synthesising new biological or chemical weapons and evading human control through means of deception and obfuscation.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Risks & Harms Taxonomies" + ], + "date": "11-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/pdf/2307.03718.pdf", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "The Fallacy of AI Functionality", + "description": "Taxonomy of four AI failure points: (1) Impossible tasks (either Conceptually impossible or Practically 
impossible), (2) Engineering failures (Design failures, Implementation failures, Missing Safety Features), (3) Post-Deployment Failures (Robustness Issues, Failure under Adversarial Attacks, Unanticipated Interactions), (4) Communication Failures (Falsified or Overstated Capabilities, Misrepresented Capabilities).", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Risks & Harms Taxonomies" + ], + "date": "7-2022", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/pdf/2206.09511.pdf", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "TASRA: a Taxonomy and Analysis of Societal-Scale Risks from AI", + "description": "Framework of 3 potential harms from AI: (1) Harm to people (individual harm, Group/community harm, Societal harm), (2) Harm to an Organisation or Enterprise, (3) Harm to a system. ", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Risks & Harms Taxonomies" + ], + "date": "6-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/pdf/2306.06924.pdf", + "website_link": "", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Cohere in-context content moderation prompt", + "description": "Few-shot prompt for classifying whether text is toxic or not.", + "modalities": [ + "Text" + ], + "categories": [ + "Risks & Harms Evaluation" + ], + "date": "12-2023", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://docs.cohere.com/reference/toxicity-detection", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "NVidia NeMo Guardrails", + "description": "Open-source tooling to create guardrails for LLM applications.", + "modalities": [ + "Text" + ], + "categories": [ + "Risks & Harms Evaluation" + ], + "date": "4-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/pdf/2310.10501.pdf", 
+ "website_link": "https://blogs.nvidia.com/blog/ai-chatbot-guardrails-nemo/", + "github_link": "https://github.com/NVIDIA/NeMo-Guardrails", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "NVIDIA_logo.png" + }, + { + "name": "SafetyPrompts", + "description": "Open repository of datasets for LLM safety", + "modalities": [ + "Text" + ], + "categories": [ + "Risks & Harms Evaluation" + ], + "date": "1-2024", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://safetyprompts.com/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "Model Risk Cards", + "description": "A framework for structured assessment and documentation of risks associated with an application of language models. Each RiskCard makes clear the routes for the risk to manifest harm, their placement in harm taxonomies, and example prompt-output pairs. The paper also describes 70+ risks identified from a literature survey.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Risks & Harms Evaluation" + ], + "date": "3-2023", + "primary_link": "Paper", + "paper_link": "https://arxiv.org/pdf/2303.18190.pdf", + "website_link": "", + "github_link": "https://github.com/leondz/lm_risk_cards", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "leondz_logo.png" + }, + { + "name": "Aya Dataset", + "description": "A permissively licensed multilingual instruction finetuning dataset curated by the Aya Annotation Platform from Cohere For AI. 
The dataset contains a total of 204k human-annotated prompt-completion pairs along with the demographics data of the annotators, spanning 65 languages.", + "modalities": [ + "Text" + ], + "categories": [ + "Finetuning Data Catalogs" + ], + "date": "2-2024", + "primary_link": "Webpage", + "paper_link": "https://arxiv.org/abs/2402.06619", + "website_link": "https://cohere.com/research/aya", + "github_link": "", + "huggingface_link": "https://huggingface.co/datasets/CohereForAI/aya_dataset", + "added_by": "Original Authors" + }, + { + "name": "HuggingFace Provenance, Watermarking & Deepfake Detection Collection", + "description": "A collection of resources on provenance, watermarking & deepfake detection tools, that are used to assess the outputs of foundation models.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Usage Monitoring" + ], + "date": "2-2024", + "primary_link": "Hugging Face object", + "paper_link": "", + "website_link": "", + "github_link": "", + "huggingface_link": "https://huggingface.co/collections/society-ethics/provenance-watermarking-and-deepfake-detection-65c6792b0831983147bb7578", + "added_by": "Original Authors" + }, + { + "name": "SIB-200", + "description": "A large-scale open-sourced benchmark dataset for topic classification in 200 languages and dialects.", + "modalities": [ + "Text" + ], + "categories": [ + "Capabilities" + ], + "date": "9-2023", + "primary_link": "GitHub", + "paper_link": "https://arxiv.org/abs/2309.07445", + "website_link": "", + "github_link": "https://github.com/dadelani/sib-200", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "dadelani_logo.png" + }, + { + "name": "French-PD-Newspapers", + "description": "Nearly three million unique newspaper and periodical editions (70B words) from the French National Library.", + "modalities": [ + "Text" + ], + "categories": [ + "Pretraining Data Sources" + ], + "date": "1-2024", + "primary_link": "Hugging Face object", + 
"paper_link": "", + "website_link": "", + "github_link": "", + "huggingface_link": "https://huggingface.co/datasets/PleIAs/French-PD-Newspapers", + "added_by": "Original Authors" + }, + { + "name": "Datatrove", + "description": "A library to process, filter and deduplicate text data at a very large scale", + "modalities": [ + "Text" + ], + "categories": [ + "Data Cleaning" + ], + "date": "12-2023", + "primary_link": "GitHub", + "paper_link": "", + "website_link": "", + "github_link": "https://github.com/huggingface/datatrove", + "huggingface_link": "", + "added_by": "Original Authors", + "logo": "huggingface_logo.png" + }, + { + "name": "Nomic", + "description": "A proprietary service to explore data with embedding maps.", + "modalities": [ + "Text" + ], + "categories": [ + "Data Exploration" + ], + "date": "Frequently Updated", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://home.nomic.ai/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + }, + { + "name": "AI Vulnerability Database", + "description": "An open-source, extensible knowledge base of AI failures.", + "modalities": [ + "Text", + "Speech", + "Vision" + ], + "categories": [ + "Usage Monitoring" + ], + "date": "Frequently Updated", + "primary_link": "Webpage", + "paper_link": "", + "website_link": "https://avidml.org/", + "github_link": "", + "huggingface_link": "", + "added_by": "Original Authors" + } +] diff --git a/app/assets/scss/custom.scss b/app/assets/scss/custom.scss new file mode 100755 index 0000000..5d4d229 --- /dev/null +++ b/app/assets/scss/custom.scss @@ -0,0 +1,277 @@ +html, +body { + scroll-padding-top: 120px; + a { + word-break: break-word; + } +} + +.container .social-icons li { + padding: 0; + margin: 0; +} + +.header-solid { + //background-color: #6fa8dc; +} + +.dark .header-solid { + background-color: #03151f; + +} +.header { + padding-top: .5rem !important; + padding-bottom: .5rem !important; + box-shadow:0 6px 14px rgba(0, 
0, 0, 0.27); /* Slight drop shadow */ + backdrop-filter: blur(20px) invert(0%); +} +.header, footer .tail { + background-color: #eff3f754 !important +} +.dark .header, .dark footer .tail { + background-color: #008391bf !important; +} + +html { + scroll-behavior: smooth; +} + +h2 { + font-size: 1.6rem !important; + font-weight: 550; +} + +h3 { + font-size: 1.4rem !important; + font-weight: 550; +} +.header { + .social-icons li a { + font-size: 1.1em !important; + height: 2rem; + width: 2rem; + } +} + +address { + display: inline; + font-style: normal; +} +.katex-display { + color: #000; +} + +.dark .katex-display { + color: #fff; +} + +#biblio pre { + background-color: #e1e9e8 !important; + max-height: 300px +} +.dark #biblio pre { + background-color: #1c1d1d !important; +} +#biblio pre code span span a { + display: none !important; +} + +.widget a { + font-size: 2em !important; +} + +.content { + :where(blockquote p:first-of-type):not( + :where([class~="not-prose"] *) + )::before { + content: none !important; + } + + ul li, + article .content ul li { + //font-size: 1.2em !important; + } + + :is(:where(p):not(:where([class~="not-prose"] *))) { + font-size: 20px !important; + } + + :where(figcaption):not(:where([class~="not-prose"] *)) { + font-size: 1.05em; + line-height: 1.4285714; + margin-top: 0.471429em; + } + + .social-icons img { + margin-bottom: 2em; + } + .social-icons a { + text-decoration: none; + } +} + +sup a { + text-decoration: none !important; +} + +.toc { + position: sticky; + top: 8rem; + align-self: start; +} + +#tableOfContentContainer { + max-height: 40vh; /* Adjust the percentage as needed */ + overflow-y: auto; +} + +nav#TableOfContents { + background-color: transparent !important; + + li.active a { + color: #022b4a !important; + text-decoration: none !important; + font-weight: 550 !important; + } + + ol ol li { + margin-left: 10px; + margin-bottom: 10px; + } +} + +.card-title { + font-size: 1rem; + font-weight: 550; +} + +.footer-menu-item { + 
margin-bottom: 10px; + font-weight: 550; + font-size: 1.3em; + line-height: 2em; +} + +.footer-menu-item p, +.footer-menu-item > a { + text-decoration: underline; +} + +.footer-menu-subitem { + display: block; + font-weight: 400; + font-size: 0.9em; +} + +.dark .faq__answer { + color: #fff; +} + +.social-icons li a, +.share-icon svg { + font-size: 1.6em !important; +} + +section:nth-of-type(2) { + padding-top: 2rem !important; +} + +.nav-dropdown-list { + min-width: 350px; + background-color: #FFF; +} + + +.accordion.active .accordion-content { + max-height: 100%; +} + +a { + overflow-wrap: break-word; +} + +code { + overflow-x: auto; + white-space: pre-wrap; + //background: #fff; +} + +pre { + overflow-x: auto; + white-space: pre-wrap; + //background: #424242 !important; + //color: #fff; +} + +@media (min-width: 1024px) { + .lg\:content :is(:where(code):not(:where([class~="not-prose"] *))) { + padding-left: 0.25rem; + padding-right: 0.25rem; + --tw-text-opacity: 1; + color: #000000 !important; + } +} + + +.readable { + font-size: 20px; +} + +:is(.dark .content) :is(:where(a):not(:where([class~="not-prose"] *))) { + --tw-text-opacity: 1; + color: rgb(242, 237, 242); +} + +.accordion { + border: 0 !important; +} + +.dark { + .nav-dropdown-list { + background-color: #000; + } + + code { + //background-color: #000; + color: #ffffff !important; + } + + .content :where(figcaption):not(:where([class~="not-prose"] *)) { + color: #fff; + } + + .lg\:content :where(figcaption):not(:where([class~="not-prose"] *)) { + color: #fff; + } + + a.btn-primary { + color: #000; + } + .content :where(ol > li):not(:where([class~="not-prose"] *))::marker { + font-weight: 400; + color: #fff; + } +} + +@media (min-width: 240px) { + .container { + max-width: 100%; + } +} +@media (min-width: 540px) { + .container { + max-width: 100%; + } +} + +@media (min-width: 768px) { + .container { + max-width: 100%; + } +} + +@media (min-width: 1280px) { + .container { + max-width: 1280px; + } +} diff 
--git a/app/assets/sections.json b/app/assets/sections.json new file mode 100644 index 0000000..1847161 --- /dev/null +++ b/app/assets/sections.json @@ -0,0 +1,79 @@ +{ + "sections": [ + { + "number": 1, + "name": "Data Sources", + "items": [ + "Pretraining data provides the fundamental ingredient to foundation models—including their capabilities and flaws. Finetuning data hones particular abilities of a model, or in the case of instruction finetuning or alignment training, improves the model’s general usability and helpfulness while reducing potential harms.", + "More data is not always better. It is essential to carefully source data, and manually inspect it to ensure it fits the goals of your project.", + "Dataset selection includes many relevant considerations, such as language and dialect coverage, topics, tasks, diversity, quality, and representation.", + "Most datasets come with implicit modifications and augmentations, from their selection, filtering, and formatting. Pay attention to these pre-processing steps, as they will impact your model.", + "Finetuning data can hone some capabilities or impair others. Use catalogs to support an informed selection, and prefer well-documented to under-documented datasets.", + "The most appropriate datasets may not exist for a given set of tasks. Be aware of the limitations of choosing from what is available." + ] + }, + { + "number": 2, + "name": "Data Preparation", + "items": [ + "Tools for searching and analysing can help developers better understand their data, and therefore understand how their model will behave; an important, but often overlooked, step of model development.", + "Data cleaning and filtering can have an immense impact on the model characteristics, though there is not a one-size-fits-all recommendation. 
The references provide filtering suggestions based on the application and communities the model is intended to serve.", + "When training a model on data from multiple sources/domains, the quantity of data seen from each domain (data mixing) can have a significant impact on downstream performance. It is common practice to upweight domains of “high-quality” data; data that is known to be written by humans and has likely gone through an editing process such as Wikipedia and books. However, data mixing is an active area of research and best practices are still being developed.", + "Removing duplicated data can reduce undesirable memorization and can improve training efficiency.", + "It is important to carefully decontaminate training datasets by removing data from evaluation benchmarks, so their capabilities can be precisely understood." + ] + }, + { + "number": 3, + "name": "Data Documentation and Release", + "items": [ + "Data documentation is essential for reproducibility, avoiding misuse, and helping downstream users build constructively on prior work.", + "We recommend starting the documentation process early, as data is collected and processed.", + "For datasets with multiple stakeholders, or derived from community efforts, it is important to proactively and appropriately organize their access, licenses, and stewardship." 
+ ] + }, + { + "number": 4, + "name": "Model Training", + "items": [ + "The foundation model life-cycle consists of several stages of training, broadly separated into pre-training and fine-tuning.", + "Decisions made by developers at any stage of training can have outsized effects on the field and the model’s positive and negative impacts, especially decisions made by well-resourced developers during the pre-training stage.", + "Developers should be thoughtful about the effects of train-time decisions and be aware of the trade-offs and potential downstream effects prior to training.", + "Due to the large economic and environmental costs incurred during model training, making appropriate use of training best practices and efficiency techniques is important in order to not waste computational or energy resources needlessly." + ] + }, + { + "number": 5, + "name": "Environmental Impact", + "items": [ + "Training and deploying AI models impacts the environment in several ways, from the rare earth minerals used for manufacturing GPUs to the water used for cooling datacenters and the greenhouse gasses (GHG) emitted by generating the energy needed to power training and inference.", + "Developers should report energy consumption and carbon emissions separately to enable apples-to-apples comparisons of models trained using different energy sources.", + "It is important to estimate and report the environmental impact not just of the final training run, but also the many experiments, evaluation, and expected downstream uses.", + "It is recommended, especially for major model releases, to measure and report their environmental impact, such as carbon footprint, via mechanisms such as model cards (see Section 3)." + ] + }, + { + "number": 6, + "name": "Model Evaluation", + "items": [ + "Model evaluation is an essential component of machine learning research. 
However, many machine learning papers use evaluations that are not reproducible or comparable to other work.", + "One of the biggest causes of irreproducibility is failure to report prompts and other essential components of evaluation protocols. This would not be a problem if researchers released evaluation code and exact prompts, but many prominent labs (OpenAI, Anthropic, Meta) have not done so for model releases. When using evaluation results from a paper that does not release its evaluation code, reproduce the evaluations using an evaluation codebase.", + "Examples of high-quality documentation practices for model evaluations can be found in Brown et al. (2020) (for bespoke evaluations) and Black et al. (2022); Scao et al. (2022); Biderman et al. (2023) (for evaluation using a public codebase).", + "Expect a released model to be used in unexpected ways. Accordingly, try to evaluate the model on benchmarks that are most related to its prescribed use case, but also its failure modes or potential misuses.", + "All evaluations come with limitations. Be careful to assess and communicate these limitations when reporting results, to avoid overconfidence in model capabilities." + ] + }, + { + "number": 7, + "name": "Model Release and Evaluation", + "items": [ + "Release models with accompanying, easy-to-run code for inference, and ideally training and evaluation.", + "Document models thoroughly to the extent possible. Model documentation is critical to avoiding misuse and harms, as well as enabling developers to effectively build on your work.", + "Open source is a technical term and standard with a widely accepted definition that is maintained by the Open Source Initiative (OSI) (Initiative, 2024). 
Not all models that are downloadable or that have publicly available weights and datasets are open-source; open-source models are those that are released under a license that adheres to the OSI standard.", + "The extent to which “responsible use licenses” are legally enforceable is unclear. While licenses that restrict end use of models may prevent commercial entities from engaging in out-of-scope uses, they are better viewed as tools for establishing norms rather than binding contracts.", + "Choosing the right license for an open-access model can be difficult. Apache 2.0 is the most common open-source license, while responsible AI licenses with use restrictions have seen growing adoption. For open-source licenses, there are several tools that are available to help developers select the right license for their artifacts.", + "Frameworks for monitoring and shaping model usage have become more prevalent as policymakers have attempted to constrain certain end uses of foundation models. Several approaches include adverse event reporting, watermarking, and restricting access to models in limited ways. Consider providing guidance to users on how to use your models responsibly and openly stating the norms you hope will shape model use." 
+ ] + } + ] +} diff --git a/app/assets/source-assets/Layer.psd b/app/assets/source-assets/Layer.psd new file mode 100644 index 0000000..30e6cb2 Binary files /dev/null and b/app/assets/source-assets/Layer.psd differ diff --git a/app/assets/source-assets/New Project.psd b/app/assets/source-assets/New Project.psd new file mode 100644 index 0000000..b6bb291 Binary files /dev/null and b/app/assets/source-assets/New Project.psd differ diff --git a/app/assets/source-assets/card-template.psd b/app/assets/source-assets/card-template.psd new file mode 100644 index 0000000..88b4f3f Binary files /dev/null and b/app/assets/source-assets/card-template.psd differ diff --git a/app/assets/source-assets/foundation-training-flowchart-trans.png b/app/assets/source-assets/foundation-training-flowchart-trans.png new file mode 100644 index 0000000..9ae0aea Binary files /dev/null and b/app/assets/source-assets/foundation-training-flowchart-trans.png differ diff --git a/app/assets/source-assets/logo-icon.xcf b/app/assets/source-assets/logo-icon.xcf new file mode 100644 index 0000000..416e128 Binary files /dev/null and b/app/assets/source-assets/logo-icon.xcf differ diff --git a/app/config/_default/languages.toml b/app/config/_default/languages.toml new file mode 100755 index 0000000..f39ff9d --- /dev/null +++ b/app/config/_default/languages.toml @@ -0,0 +1,6 @@ +################ English language ################## +[en] +languageName = "En" +languageCode = "en-us" +contentDir = "content/english" +weight = 1 diff --git a/app/config/_default/menus.en.toml b/app/config/_default/menus.en.toml new file mode 100755 index 0000000..48d0c69 --- /dev/null +++ b/app/config/_default/menus.en.toml @@ -0,0 +1,237 @@ +############# English navigation ############## + +[[main]] +name = "Browse Resources" +url = "/foundation-model-resources/" +weight = 2 + +[[main]] +parent = "Browse Resources" +name = "Data Sources" +url = "/foundation-model-resources/#data-sources" +weight = 1 + +[[main]] +parent = 
"Browse Resources" +name = "Data Preparation" +url = "/foundation-model-resources/#data-preparation" +weight = 2 + +[[main]] +parent = "Browse Resources" +name = "Model Training" +url = "/foundation-model-resources/#model-training" +weight = 4 + +[[main]] +parent = "Browse Resources" +name = "Data Documentation & Release" +url = "/foundation-model-resources/#data-documentation-and-release" +weight = 3 + + + +[[main]] +parent = "Browse Resources" +name = "Environmental Impact" +url = "/foundation-model-resources/#environmental-impact" +weight = 5 + +[[main]] +parent = "Browse Resources" +name = "Model Evaluation" +url = "/foundation-model-resources/#model-evaluation" +weight = 6 + +[[main]] +parent = "Browse Resources" +name = "Model Release & Evaluation" +url = "/foundation-model-resources/#model-release-and-evaluation" +weight = 7 + + + + + +[[main]] +name = "Get Involved" + +[[main]] +parent = "Get Involved" +name = "Contribute" +url = "/contribute/" +weight = 1 + +[[main]] +parent = "Get Involved" +name = "View Contributors" +url = "/contributors/" +weight = 2 + + +# footer menu +[[footer]] +name = "Get Involved" +haschildren = true +weight = 1 + +[[footer]] +parent = "Get Involved" +name = "Contribute" +url = "/contribute/" + +[[footer]] +parent = "Get Involved" +name = "Contributors" +url = "/contributors/" +# +#[[footer]] +#name = "Data Sources" +#weight = 2 +# +#[[footer]] +#parent = "Data Sources" +#name = "Pretraining Data Sources" +#url = "/foundation-model-resources/pretraining-data-sources/" +#weight = 1 +# +#[[footer]] +#parent = "Data Sources" +#name = "Fine-tuning Data Catalogs" +#url = "/foundation-model-resources/finetuning-data-catalogs/" +#weight = 2 +# +# +#[[footer]] +#name = "Data Preparation" +#weight = 3 +# +#[[footer]] +#parent = "Data Preparation" +#name = "Data Search Analysis Exploration" +#url = "/foundation-model-resources/data-search-analysis-exploration/" +#weight = 1 +# +#[[footer]] +#parent = "Data Preparation" +#name = "Data 
Cleaning, Filtering & Mixing" +#url = "/foundation-model-resources/data-cleaning-filtering-mixing/" +#weight = 2 +# +#[[footer]] +#parent = "Data Preparation" +#name = "Data Deduplication" +#url = "/foundation-model-resources/data-deduplication/" +#weight = 3 +# +#[[footer]] +#parent = "Data Preparation" +#name = "Data Decontamination" +#url = "/foundation-model-resources/data-decontamination/" +#weight = 4 +# +#[[footer]] +#parent = "Data Preparation" +#name = "Data Auditing" +#url = "/foundation-model-resources/data-auditing/" +#weight = 5 +# +#[[footer]] +#name = "Data Documentation and Release" +#weight = 4 +# +#[[footer]] +#parent = "Data Documentation and Release" +#name = "Data Documentation" +#url = "/foundation-model-resources/data-documentation/" +#weight = 1 +# +#[[footer]] +#parent = "Data Documentation and Release" +#name = "Data Governance" +#url = "/foundation-model-resources/data-governance/" +#weight = 2 +# +# +#[[footer_right]] +#name = "Model Training" +#weight = 1 +# +#[[footer_right]] +#parent = "Model Training" +#name = "Pretraining Repositories" +#url = "/foundation-model-resources/model-training-pretraining-repositories/" +#weight = 1 +# +#[[footer_right]] +#parent = "Model Training" +#name = "Finetuning Repositories" +#url = "/foundation-model-resources/model-training-finetuning-repositories/" +#weight = 2 +# +#[[footer_right]] +#parent = "Model Training" +#name = "Efficiency Resource Allocation" +#url = "/foundation-model-resources/model-training-efficiency-resource-allocation/" +#weight = 3 +# +#[[footer_right]] +#parent = "Model Training" +#name = "Educational Resources" +#url = "/foundation-model-resources/model-training-educational-resources/" +#weight = 4 +# +#[[footer_right]] +#name = "Environmental Impact" +#weight = 2 +# +#[[footer_right]] +#parent = "Environmental Impact" +#name = "Estimating Environmental Impact" +#url = "/foundation-model-resources/environmental-impact/" +#weight = 1 +# +#[[footer_right]] +#name = "Models 
Evaluation" +#weight = 3 +# +#[[footer_right]] +#parent = "Models Evaluation" +#name = "Capabilities" +#url = "/foundation-model-resources/model-evaluation-capabilities/" +# +#[[footer_right]] +#parent = "Models Evaluation" +#name = "Risks & Harm Taxonomies" +#url = "/foundation-model-resources/model-evaluation-risks-harms-taxonomies/" +# +#[[footer_right]] +#parent = "Models Evaluation" +#name = "Model Evaluation Risks Harms" +#url = "/foundation-model-resources/model-evaluation-risks-harms/" +# +# +#[[footer_right]] +#name = "Model Release & Monitoring" +#weight = 4 +# +#[[footer_right]] +#parent = "Model Release & Monitoring" +#name = "Model Documentation" +#url = "/foundation-model-resources/model-documentation/" +# +#[[footer_right]] +#parent = "Model Release & Monitoring" +#name = "Reproducibility" +#url = "/foundation-model-resources/reproducibility/" +# +#[[footer_right]] +#parent = "Model Release & Monitoring" +#name = "License Selection" +#url = "/foundation-model-resources/license-selection/" +# +#[[footer_right]] +#parent = "Model Release & Monitoring" +#name = "Usage Monitoring" +#url = "/foundation-model-resources/usage-monitoring/" +# diff --git a/app/config/_default/module.toml b/app/config/_default/module.toml new file mode 100644 index 0000000..88a4d9f --- /dev/null +++ b/app/config/_default/module.toml @@ -0,0 +1,93 @@ +[hugoVersion] +extended = true +min = "0.115.2" + +# [[imports]] +# path = "github.com/zeon-studio/hugoplate" + +[[imports]] +path = "github.com/gethugothemes/hugo-modules/search" + +[[imports]] +#path = "github.com/gethugothemes/hugo-modules/pwa" + +[[imports]] +path = "github.com/gethugothemes/hugo-modules/images" + +[[imports]] +path = "github.com/gethugothemes/hugo-modules/videos" + +[[imports]] +path = "github.com/gethugothemes/hugo-modules/icons/font-awesome" + +# [[imports]] +# path = "github.com/gethugothemes/hugo-modules/icons/themify-icons" + +[[imports]] +path = "github.com/gethugothemes/hugo-modules/gzip-caching" + 
+[[imports]] +#path = "github.com/gethugothemes/hugo-modules/adsense" + +[[imports]] +path = "github.com/gethugothemes/hugo-modules/accordion" + +[[imports]] +#path = "github.com/gethugothemes/hugo-modules/table-of-contents" + +[[imports]] +path = "github.com/gethugothemes/hugo-modules/tab" + +[[imports]] +path = "github.com/gethugothemes/hugo-modules/modal" + +[[imports]] +path = "github.com/gethugothemes/hugo-modules/gallery-slider" + +[[imports]] +#path = "github.com/gethugothemes/hugo-modules/components/preloader" + +[[imports]] +path = "github.com/gethugothemes/hugo-modules/components/social-share" + +[[imports]] +path = "github.com/gethugothemes/hugo-modules/components/cookie-consent" + +[[imports]] +path = "github.com/gethugothemes/hugo-modules/components/custom-script" + +[[imports]] +path = "github.com/gethugothemes/hugo-modules/components/render-link" + +# [[imports]] +# path = "github.com/gethugothemes/hugo-modules/components/valine-comment" + +# [[imports]] +# path = "github.com/gethugothemes/hugo-modules/components/crisp-chat" + +[[imports]] +path = "github.com/gethugothemes/hugo-modules/shortcodes/button" + +[[imports]] +path = "github.com/gethugothemes/hugo-modules/shortcodes/notice" + +[[imports]] +path = "github.com/gethugothemes/hugo-modules/seo-tools/basic-seo" + +[[imports]] +path = "github.com/gethugothemes/hugo-modules/seo-tools/site-verifications" + +[[imports]] +#path = "github.com/gethugothemes/hugo-modules/seo-tools/google-tag-manager" + +# [[imports]] +# path = "github.com/gethugothemes/hugo-modules/seo-tools/baidu-analytics" + +# [[imports]] +# path = "github.com/gethugothemes/hugo-modules/seo-tools/matomo-analytics" + +# [[imports]] +# path = "github.com/gethugothemes/hugo-modules/seo-tools/plausible-analytics" + +# [[imports]] +# path = "github.com/gethugothemes/hugo-modules/seo-tools/counter-analytics" diff --git a/app/config/_default/params.toml b/app/config/_default/params.toml new file mode 100755 index 0000000..b0b22b6 --- 
/dev/null +++ b/app/config/_default/params.toml @@ -0,0 +1,99 @@ +#################### default parameters ################################ +# favicon +favicon = "images/favicon.png" +# logo +logo = "images/fmcheatsheet-logo-large.png" +logo_darkmode = "images/fmcheatsheet-logo-dark.png" +# use `px` or `x` with logo_width, example: "100px". +# Note: logo_width does not work with .svg files +logo_width = "371px" +logo_height = "25px" +# if logo_webp is set to false, the WEBP version of the logo will not be generated | default is true +logo_webp = true +# logo text will only show when logo is missing. +logo_text = "Foundation Model Cheatsheet" +# navbar fixed to top +navbar_fixed = true +# theme-mode +theme_switcher = false +theme_default = "light" # available options [light/dark/system] +# Main Sections +mainSections = ["foundation-model-resources"] +# contact form action +contact_form_action = "/formHandle.php" # contact form works with [https://airform.io/] or [https://formspree.io] +# google tag manager, see https://developers.google.com/tag-manager/ +google_tag_manager = ""# "G-BE3HVN9469" # example: G-XXXXXXXXXX +google_adsense = "" # ca-pub-7563926680228471" # example: ca-pub-xxxxxxxxxxxxxxxx +# custom script on header, example: custom_script= "" +custom_script = "" +# copyright +copyright = 'Website hosted by the Allen Institute for AI | Design by AI Models for Open Source AI | Copyright © 2024' + + + +# Preloader +[preloader] +enable = false +preloader = "" # use jpg, png, svg or gif format. 
+ +# Navigation button +[navigation_button] +enable = true +label = "Download Cheatsheet (.pdf)" +link = "foundation-model-development-cheatsheet.pdf" + +# search +[search] +enable = true +primary_color = "#121212" +include_sections = ["foundation-model-resources"] +show_image = true +show_description = true +show_tags = false +show_categories = false + + +# seo meta data for OpenGraph / Twitter Card +[metadata] +keywords = ["Foundation Models", "Open Source AI", "Artificial Intelligence"] +description = "Foundation Model Cheatsheet is a collection of resources for the AI community to learn about the latest advancements in AI and to contribute to the open source AI ecosystem." +image = "images/fmcheatsheet.png" + + +# site verifications +[site_verification] +google = "" # Your verification code +bing = "" # Your verification code +baidu = "" # Your verification code +facebook = "" # Your verification code +mastodon = "" # Your verification code + +# cookies module: https://github.com/gethugothemes/hugo-modules/tree/master/components/cookie-consent +[cookies] +enable = true +expire_days = 60 +content = "This site uses cookies. By continuing to use this website, you agree to their use." +button = "I Accept" + +######################## sidebar widgets ######################### +[widgets] +sidebar = ["categories", "tags"] +models = ["categories","communities"] + + +# google map +[google_map] +enable = false +map_api_key = "" +map_latitude = "51.5223477" +map_longitude = "-0.1622023" +map_marker = "images/marker.png" + + +# Subscription +[subscription] +enable = true +# mailchimp subscription +mailchimp_form_action = "" # replace this url with yours +mailchimp_form_name = "" + diff --git a/app/content/404.md b/app/content/404.md new file mode 100644 index 0000000..7310215 --- /dev/null +++ b/app/content/404.md @@ -0,0 +1,11 @@ +--- +title: Whoops! Page not found +noindex: true +layout: page +--- + +# 404 + +That page can't be found. 
+ +Our latest content is [on the homepage](/). diff --git a/app/content/english/contribute/index.md b/app/content/english/contribute/index.md new file mode 100644 index 0000000..6369f61 --- /dev/null +++ b/app/content/english/contribute/index.md @@ -0,0 +1,37 @@ +--- +title: "Contribute" +description: "Foundation Model Cheatsheet" +--- + +## Add to Cheatsheet + +To contribute resources to the cheatsheet, please review the Criteria for Inclusion below, and the Add Resource Instructions. + +### Criteria for Inclusion: + +The resources are selected based on a literature review for each phase of foundation model development. +Inclusion is predicated on: the **perceived helpfulness as a development tool**, the **extent and quality of the documentation**, +and the **insights brought to the development process**. + +Please ensure your candidate resource will meaningfully aid *responsible* development practices. +While we do accept academic literature as a resource, this cheatsheet focuses on tools, such as data catalogs, +search/analysis tools, evaluation repositories, and, selectively, literature that summarizes, surveys, or guides important development decisions. + +We will review suggested contributions and (optionally) acknowledge contributors to this cheatsheet on the website +and in future work. + +### Add Resource Instructions: + +* Option 1: Use this [upload form](https://forms.gle/aAa2dismSzCpae4p6) to contribute a resource. +* Option 2: Bulk upload resources by creating a pull request in this repository, extending `app/resources/resources.jsonl`. + +> In both cases, it is essential that the requested documentation on each resource is accurate and complete. + + +## Contact and Citation + +Contact [slongpre@media.mit.edu](mailto:slongpre@media.mit.edu) for questions about this resource. + +``` +Citation coming soon. 
+``` diff --git a/app/content/english/contributors/index.md b/app/content/english/contributors/index.md new file mode 100644 index 0000000..116f2b2 --- /dev/null +++ b/app/content/english/contributors/index.md @@ -0,0 +1,25 @@ +--- +title: "Contributors" +description: "To create this cheatsheet, a variety of contributors were asked to propose resources, papers, and tools relevant to open foundation model development." +--- + +To create this cheatsheet, a variety of contributors provided resources, papers, and tools relevant to open foundation model development. Resources were grouped and curated by a team with a focus on speech and vision modalities (led by Nay San and Gabriel Ilharco, respectively). + +**Curators (Alphabetical):** + +* **Data (Across Subcategories):** David Adelani, Stella Biderman, Gabriel Ilharco, Kyle Lo, Shayne Longpre, Luca Soldaini, Nay San +* **Data Cleaning, Filtering, & Mixing:** Alon Albalak, Kyle Lo, Luca Soldaini +* **Data Decontamination:** Stella Biderman, Shayne Longpre +* **Data Governance:** Stella Biderman, Yacine Jernite, Sayash Kapoor +* **Efficiency & Resource Allocation:** Hailey Schoelkopf +* **Environmental Impact:** Peter Henderson, Sayash Kapoor, Sasha Luccioni +* **General Capabilities:** Rishi Bommasani, Shayne Longpre +* **License Selection:** Stella Biderman, Yacine Jernite, Kevin Klyman, Aviya Skowron, Daniel McDuff +* **Model Documentation:** Sayash Kapoor, Shayne Longpre +* **Pretraining Repositories:** Stella Biderman, Gabriel Ilharco, Nay San, Hailey Schoelkopf +* **Reproducibility:** Stella Biderman, Shayne Longpre +* **Risks & Harms:** Maribeth Rauh, Laura Weidinger, Bertie Vidgen +* **Usage Monitoring:** Kevin Klyman +* **Website:** Shayne Longpre, Luca Soldaini, Justin Riddiough + +**Advisors:** Stella Biderman, Peter Henderson, Yacine Jernite, Sasha Luccioni, Percy Liang, Arvind Narayanan, Victor Sanh diff --git a/app/content/english/foundation-model-resources/_index.json 
b/app/content/english/foundation-model-resources/_index.json new file mode 100644 index 0000000..5f2a7c4 --- /dev/null +++ b/app/content/english/foundation-model-resources/_index.json @@ -0,0 +1,54 @@ +{ + "@context": "https://schema.org", + "@type": "FAQPage", + "mainEntity": [ + { + "@type": "Question", + "name": "What is the foundation model cheatsheet?", + "acceptedAnswer": { + "@type": "Answer", + "text": "The foundation model cheatsheet is a curated collection of tools, datasets, code examples, and papers that guide the development of foundation models (large language models, image generators, etc.). It's designed for newer developers who want to build and release these models responsibly." + } + }, + { + "@type": "Question", + "name": "Why was this cheatsheet created?", + "acceptedAnswer": { + "@type": "Answer", + "text": "This cheatsheet was created to: 1) Help developers navigate the complex landscape of responsible foundation model development. 2) Provide guidance on mitigating potential misuses or harms of these models. 3) Highlight helpful resources that might not be widely known." + } + }, + { + "@type": "Question", + "name": "What kinds of resources are included?", + "acceptedAnswer": { + "@type": "Answer", + "text": "The cheatsheet includes: data catalogs (especially for less common languages), tools for searching and analyzing data, repositories for evaluating models, and papers that summarize key development decisions." + } + }, + { + "@type": "Question", + "name": "How do I contribute to the cheatsheet?", + "acceptedAnswer": { + "@type": "Answer", + "text": "The cheatsheet is a living document! You can contribute new resources by following the instructions given on the website. Your contributions will be reviewed for relevance and quality." + } + }, + { + "@type": "Question", + "name": "Is this cheatsheet suitable for everyone?", + "acceptedAnswer": { + "@type": "Answer", + "text": "It's primarily aimed at newer foundation model developers. 
Larger organizations that build commercial AI products have additional factors to consider." + } + }, + { + "@type": "Question", + "name": "Does the cheatsheet cover all foundation model types?", + "acceptedAnswer": { + "@type": "Answer", + "text": "Currently, it's focused on text, vision, and speech models. The creators acknowledge that this is just a starting point." + } + } + ] +} diff --git a/app/content/english/foundation-model-resources/_index.md b/app/content/english/foundation-model-resources/_index.md new file mode 100644 index 0000000..6dc2ea2 --- /dev/null +++ b/app/content/english/foundation-model-resources/_index.md @@ -0,0 +1,10 @@ +--- +title: "Foundation Model Resources" +type: "fm-resource-category" +nofollow: true +date: '2023-12-26' +description: "Explore a curated collection of resources for responsible foundation model development. Find tools, artifacts, and insightful papers to empower your journey." +image: ai-tools.png +--- + +Welcome to our curated collection of resources for responsible foundation model development! Here, you'll find a diverse array of tools, artifacts, and insightful papers aimed at guiding developers in navigating the complexities of model development. Our selection criteria emphasize the usefulness, documentation quality, and community awareness of each resource. 
diff --git a/app/content/english/foundation-model-resources/ai-tools.png b/app/content/english/foundation-model-resources/ai-tools.png new file mode 100644 index 0000000..ccd90fb Binary files /dev/null and b/app/content/english/foundation-model-resources/ai-tools.png differ diff --git a/app/content/english/foundation-model-resources/buildcat.py b/app/content/english/foundation-model-resources/buildcat.py new file mode 100644 index 0000000..c0f06a7 --- /dev/null +++ b/app/content/english/foundation-model-resources/buildcat.py @@ -0,0 +1,28 @@ +import os +import json +from slugify import slugify + +# Read the JSON file +with open('/home/iguana/WebstormProjects/fm-cheatsheet/assets/cats.json', 'r') as json_file: + categories = json.load(json_file) + +# Iterate through each category +for category_name, category_data in categories.items(): + # Create directory name from category name + directory_name = slugify(category_name) + # Create directory if it doesn't exist + os.makedirs(directory_name, exist_ok=True) + # Create index.md file path + index_file_path = os.path.join(directory_name, 'index.md') + # Write contents to index.md + with open(index_file_path, 'w') as index_file: + index_file.write('---\n') + index_file.write(f'title: "{category_name}"\n') + index_file.write(f'short_name: "{category_name}"\n') + index_file.write('type: "fm-resource-category"\n') + index_file.write('date: "2024-03-17"\n') # Update date as needed + index_file.write(f'description: "{category_data["meta_description"]}"\n') # meta description here + index_file.write('highlight: true\n') + index_file.write(f'image: {directory_name}.png\n') # Update image as needed + index_file.write(f'details: "{category_data["description"]}"\n') # description here + index_file.write('---\n') diff --git a/app/content/english/foundation-model-resources/data-auditing/data-auditing.png b/app/content/english/foundation-model-resources/data-auditing/data-auditing.png new file mode 100644 index 0000000..451e7e4 Binary 
files /dev/null and b/app/content/english/foundation-model-resources/data-auditing/data-auditing.png differ diff --git a/app/content/english/foundation-model-resources/data-auditing/index.md b/app/content/english/foundation-model-resources/data-auditing/index.md new file mode 100644 index 0000000..e7ad0d0 --- /dev/null +++ b/app/content/english/foundation-model-resources/data-auditing/index.md @@ -0,0 +1,11 @@ +--- +title: "Data Auditing Resources for Foundation Models" +short_name: "Data Auditing" +type: "fm-resource-category" +date: "2024-03-17" +description: "Discover the importance of auditing datasets in foundation model development. Learn how systematic studies and exploration tools can ensure dataset integrity and effectiveness." +section_id: 2.5 +highlight: true +image: data-auditing.png +details: "Auditing datasets is essential: spend a substantial amount of time inspecting your dataset at multiple stages of the dataset design process. Many datasets have problems specifically because the authors did not do sufficient auditing before releasing them. Use systematic studies of the process in addition to data search, analysis, & exploration tools to track the dataset's evolution." 
+--- diff --git a/app/content/english/foundation-model-resources/data-cleaning-filtering-mixing/data-cleaning-filtering-mixing.png b/app/content/english/foundation-model-resources/data-cleaning-filtering-mixing/data-cleaning-filtering-mixing.png new file mode 100644 index 0000000..b501caf Binary files /dev/null and b/app/content/english/foundation-model-resources/data-cleaning-filtering-mixing/data-cleaning-filtering-mixing.png differ diff --git a/app/content/english/foundation-model-resources/data-cleaning-filtering-mixing/index.md b/app/content/english/foundation-model-resources/data-cleaning-filtering-mixing/index.md new file mode 100644 index 0000000..441921a --- /dev/null +++ b/app/content/english/foundation-model-resources/data-cleaning-filtering-mixing/index.md @@ -0,0 +1,11 @@ +--- +title: "Data Cleaning, Filtering, & Mixing Resources for Foundation Models" +short_name: "Data Cleaning" +type: "fm-resource-category" +date: "2024-03-17" +section_id: 2.2 +description: "Master data cleaning, filtering, and mixing techniques for foundation model datasets." +highlight: true +image: data-cleaning-filtering-mixing.png +details: "Data quality is crucial. Filtering can remove unwanted data, improving training efficiency and ensuring desirable properties like high information content, desired languages, low toxicity, and minimal personally identifiable information. Consider trade-offs when using filters and understand the importance of data mixtures." 
+--- diff --git a/app/content/english/foundation-model-resources/data-decontamination/data-decontamination.png b/app/content/english/foundation-model-resources/data-decontamination/data-decontamination.png new file mode 100644 index 0000000..f65e5d9 Binary files /dev/null and b/app/content/english/foundation-model-resources/data-decontamination/data-decontamination.png differ diff --git a/app/content/english/foundation-model-resources/data-decontamination/index.md b/app/content/english/foundation-model-resources/data-decontamination/index.md new file mode 100644 index 0000000..43d4078 --- /dev/null +++ b/app/content/english/foundation-model-resources/data-decontamination/index.md @@ -0,0 +1,11 @@ +--- +title: "Data Decontamination Resources for Foundation Models" +short_name: "Data Decontamination" +type: "fm-resource-category" +date: "2024-03-17" +section_id: 2.4 +description: "Explore data decontamination techniques for foundation model training datasets. Learn how to protect test data integrity and ensure reliable model evaluation with canaries and proactive decontamination methods." +highlight: true +image: data-decontamination.png +details: "Data decontamination is the process of removing evaluation data from the training set. This step ensures the integrity of model evaluation. The following resources aid in proactively protecting test data with canaries, decontaminating data before training, and identifying or proving what data a model was trained on." 
+--- diff --git a/app/content/english/foundation-model-resources/data-deduplication/data-deduplication.png b/app/content/english/foundation-model-resources/data-deduplication/data-deduplication.png new file mode 100644 index 0000000..633874c Binary files /dev/null and b/app/content/english/foundation-model-resources/data-deduplication/data-deduplication.png differ diff --git a/app/content/english/foundation-model-resources/data-deduplication/index.md b/app/content/english/foundation-model-resources/data-deduplication/index.md new file mode 100644 index 0000000..dfa332c --- /dev/null +++ b/app/content/english/foundation-model-resources/data-deduplication/index.md @@ -0,0 +1,11 @@ +--- +title: "Data Deduplication Resources for Foundation Models" +short_name: "Data Deduplication" +type: "fm-resource-category" +date: "2024-03-17" +section_id: 2.3 +description: "Learn about data deduplication, a crucial preprocessing step for foundation model datasets. Discover how removing duplicates enhances model training efficiency and reduces the risk of memorizing undesirable information." +highlight: true +image: data-deduplication.png +details: "Removing data duplicates can 1) reduce the likelihood of memorizing undesirable pieces of information such as boilerplate text, copyrighted data, and personally identifiable information, and 2) improve training efficiency by reducing the total dataset size. Practitioners should always determine whether duplicated data will harm or help the model for their use case." 
+--- diff --git a/app/content/english/foundation-model-resources/data-documentation/data-documentation.png b/app/content/english/foundation-model-resources/data-documentation/data-documentation.png new file mode 100644 index 0000000..c0f7296 Binary files /dev/null and b/app/content/english/foundation-model-resources/data-documentation/data-documentation.png differ diff --git a/app/content/english/foundation-model-resources/data-documentation/index.md b/app/content/english/foundation-model-resources/data-documentation/index.md new file mode 100644 index 0000000..2811368 --- /dev/null +++ b/app/content/english/foundation-model-resources/data-documentation/index.md @@ -0,0 +1,11 @@ +--- +title: "Data Documentation Resources for Foundation Models" +short_name: "Data Documentation" +type: "fm-resource-category" +date: "2024-03-17" +description: "Understand the significance of data documentation for foundation model datasets. Thorough documentation ensures users understand data usage, legal restrictions, and privacy concerns, despite potential errors in crowdsourced documentation." +section_id: 3.1 +highlight: true +image: data-documentation.png +details: "Data documentation allows users to understand their intended uses, legal restrictions, attribution, relevant contents, privacy concerns, and other limitations. Many data documentation standards have been proposed, but their adoption has been uneven. It is important to recognize that crowdsourced documentation may contain errors and omissions." 
+--- diff --git a/app/content/english/foundation-model-resources/data-governance/data-governance.png b/app/content/english/foundation-model-resources/data-governance/data-governance.png new file mode 100644 index 0000000..eb4f9b1 Binary files /dev/null and b/app/content/english/foundation-model-resources/data-governance/data-governance.png differ diff --git a/app/content/english/foundation-model-resources/data-governance/index.md b/app/content/english/foundation-model-resources/data-governance/index.md new file mode 100644 index 0000000..2183064 --- /dev/null +++ b/app/content/english/foundation-model-resources/data-governance/index.md @@ -0,0 +1,11 @@ +--- +title: "Data Governance Resources for Foundation Models" +short_name: "Data Governance" +type: "fm-resource-category" +date: "2024-03-17" +description: "Explore data governance practices for foundation model datasets. Learn about data curation, access control, and enabling data subjects to request removal from hosted datasets to ensure compliance with privacy and legal requirements." +highlight: true +section_id: 3.2 +image: data-governance.png +details: "Releasing all datasets involved in the development of a Foundation Model, including training, fine-tuning, and evaluation data, can facilitate external scrutiny and support further research. Proper data governance practices, including respecting opt-out preference signals, pseudonymization, or PII redaction, are required at the curation and release stages. Data access control based on research needs and enabling data subjects to request removal from the hosted dataset are essential." 
+--- diff --git a/app/content/english/foundation-model-resources/data-search-analysis-exploration/data-search-analysis-exploration.png b/app/content/english/foundation-model-resources/data-search-analysis-exploration/data-search-analysis-exploration.png new file mode 100644 index 0000000..e528f13 Binary files /dev/null and b/app/content/english/foundation-model-resources/data-search-analysis-exploration/data-search-analysis-exploration.png differ diff --git a/app/content/english/foundation-model-resources/data-search-analysis-exploration/index.md b/app/content/english/foundation-model-resources/data-search-analysis-exploration/index.md new file mode 100644 index 0000000..3604093 --- /dev/null +++ b/app/content/english/foundation-model-resources/data-search-analysis-exploration/index.md @@ -0,0 +1,11 @@ +--- +title: "Data Search, Analysis, & Exploration Resources for Foundation Models" +short_name: "Data Exploration" +type: "fm-resource-category" +date: "2024-03-17" +section_id: 2.1 +description: "Learn how to explore and analyze training datasets effectively for foundation models. Understand the nuances of data distributions, topics, and formats to better train your model." +highlight: true +image: data-search-analysis-exploration.png +details: "Exploring training datasets with search and analysis tools helps practitioners develop a nuanced intuition for what is in the data, and therefore their model. Data can be difficult to understand, summarize or document without hands-on exploration." 
+--- diff --git a/app/content/english/foundation-model-resources/environmental-impact/environmental-impact-card-template.png b/app/content/english/foundation-model-resources/environmental-impact/environmental-impact-card-template.png new file mode 100644 index 0000000..462dee8 Binary files /dev/null and b/app/content/english/foundation-model-resources/environmental-impact/environmental-impact-card-template.png differ diff --git a/app/content/english/foundation-model-resources/environmental-impact/environmental-impact.png b/app/content/english/foundation-model-resources/environmental-impact/environmental-impact.png new file mode 100644 index 0000000..1662c8f Binary files /dev/null and b/app/content/english/foundation-model-resources/environmental-impact/environmental-impact.png differ diff --git a/app/content/english/foundation-model-resources/environmental-impact/index.md b/app/content/english/foundation-model-resources/environmental-impact/index.md new file mode 100644 index 0000000..07e1989 --- /dev/null +++ b/app/content/english/foundation-model-resources/environmental-impact/index.md @@ -0,0 +1,11 @@ +--- +title: "Environmental Impact Resources for Foundation Models" +short_name: "Environmental Impact" +type: "fm-resource-category" +date: "2024-03-17" +description: "Explore resources for estimating and mitigating the environmental impact of foundation model development. Learn about tools and methodologies for measuring energy consumption during training or inference and minimizing environmental impact throughout the model lifecycle." +highlight: true +section_id: 5.1 +image: environmental-impact.png +details: "Foundation model development is often resource intensive. The following tools help one to measure energy consumption and estimate the carbon intensity of the energy source used. Decisions made during or prior to model training can have a significant effect on the upstream and downstream environmental impact of a given model." 
+--- diff --git a/app/content/english/foundation-model-resources/finetuning-data-catalogs/finetuning-data-catalogs.png b/app/content/english/foundation-model-resources/finetuning-data-catalogs/finetuning-data-catalogs.png new file mode 100644 index 0000000..54f7734 Binary files /dev/null and b/app/content/english/foundation-model-resources/finetuning-data-catalogs/finetuning-data-catalogs.png differ diff --git a/app/content/english/foundation-model-resources/finetuning-data-catalogs/index.md b/app/content/english/foundation-model-resources/finetuning-data-catalogs/index.md new file mode 100644 index 0000000..33f26a6 --- /dev/null +++ b/app/content/english/foundation-model-resources/finetuning-data-catalogs/index.md @@ -0,0 +1,11 @@ +--- +title: "Finetuning Data Catalogs for Foundation Models" +short_name: "Finetuning Data Catalogs" +type: "fm-resource-category" +date: "2024-03-17" +section_id: 1.2 +description: "Discover the breadth of finetuning data sources available for foundation models. From HuggingFace Datasets to specialized catalogs, find resources with strong documentation and diverse data sets." +highlight: true +image: finetuning-data-catalogs.png +details: "Finetuning or adaptation of foundation models is a complex step in model development. These models are more frequently deployed than base models. Here, we link to some useful and widely-used resources for finetuning." +--- diff --git a/app/content/english/foundation-model-resources/license-selection/index.md b/app/content/english/foundation-model-resources/license-selection/index.md new file mode 100644 index 0000000..11bf8ba --- /dev/null +++ b/app/content/english/foundation-model-resources/license-selection/index.md @@ -0,0 +1,11 @@ +--- +title: "License Selection Resources for Foundation Models" +short_name: "License Selection" +type: "fm-resource-category" +date: "2024-03-17" +section_id: 7.3 +description: "Explore license selection considerations for foundation models. 
Learn about different types of licenses and their implications for model distribution, use, and adaptation. Discover resources and examples to help guide developers in selecting appropriate licenses for their models." +highlight: true +image: license-selection.png +details: "Foundation models, like software, are accompanied by licenses that determine how they may be distributed, used, and repurposed. The following resources can help one determine which type of license to use." +--- diff --git a/app/content/english/foundation-model-resources/license-selection/license-selection.png b/app/content/english/foundation-model-resources/license-selection/license-selection.png new file mode 100644 index 0000000..13f839a Binary files /dev/null and b/app/content/english/foundation-model-resources/license-selection/license-selection.png differ diff --git a/app/content/english/foundation-model-resources/model-documentation/index.md b/app/content/english/foundation-model-resources/model-documentation/index.md new file mode 100644 index 0000000..9fc11e7 --- /dev/null +++ b/app/content/english/foundation-model-resources/model-documentation/index.md @@ -0,0 +1,11 @@ +--- +title: "Model Documentation Resources" +short_name: "Model Documentation" +type: "fm-resource-category" +date: "2024-03-17" +section_id: 7.1 +description: "Learn about model documentation for foundation models. Discover standards and tools for effectively documenting models, including specifications for model usage, recommended use cases, potential risks, and decisions made during training." +highlight: true +image: model-documentation.png +details: "It is important to document models that are used and released. Even models and code released openly are important to document thoroughly, in order to specify how to use the model, recommended and non-recommended use cases, potential harms, state or justify decisions made during training, and more. The following tools can help with documentation." 
+--- diff --git a/app/content/english/foundation-model-resources/model-documentation/model-documentation.png b/app/content/english/foundation-model-resources/model-documentation/model-documentation.png new file mode 100644 index 0000000..8b68634 Binary files /dev/null and b/app/content/english/foundation-model-resources/model-documentation/model-documentation.png differ diff --git a/app/content/english/foundation-model-resources/model-evaluation-capabilities/index.md b/app/content/english/foundation-model-resources/model-evaluation-capabilities/index.md new file mode 100644 index 0000000..4534882 --- /dev/null +++ b/app/content/english/foundation-model-resources/model-evaluation-capabilities/index.md @@ -0,0 +1,11 @@ +--- +title: "Resources for Model Evaluation Capabilities" +short_name: "Capabilities" +type: "fm-resource-category" +date: "2024-03-17" +description: "Explore evaluation capabilities for foundation models. Understand the challenges in evaluating open-ended use cases and discover benchmarks and methodologies for assessing model performance in diverse tasks and applications." +highlight: true +section_id: 6.1 +image: model-evaluation-capabilities.png +details: "Many modern foundation models are released with general abilities, such that their use cases are poorly specified and open-ended, posing significant challenges to evaluation benchmarks which are unable to critically evaluate so many tasks, applications, and risks systematically or fairly. It is important to carefully scope the original intentions for the model, and the evaluations to those intentions." 
+--- diff --git a/app/content/english/foundation-model-resources/model-evaluation-capabilities/model-evaluation-capabilities.png b/app/content/english/foundation-model-resources/model-evaluation-capabilities/model-evaluation-capabilities.png new file mode 100644 index 0000000..8bf0b56 Binary files /dev/null and b/app/content/english/foundation-model-resources/model-evaluation-capabilities/model-evaluation-capabilities.png differ diff --git a/app/content/english/foundation-model-resources/model-evaluation-risks-harms-taxonomies/index.md b/app/content/english/foundation-model-resources/model-evaluation-risks-harms-taxonomies/index.md new file mode 100644 index 0000000..697b2d4 --- /dev/null +++ b/app/content/english/foundation-model-resources/model-evaluation-risks-harms-taxonomies/index.md @@ -0,0 +1,11 @@ +--- +title: "Risks & Harms Taxonomy Resources for Foundation Models" +short_name: "Risks & Harms Taxonomies" +type: "fm-resource-category" +date: "2024-03-17" +description: "Discover taxonomies for evaluating risks and harms in foundation models. Learn about categorizing and understanding risks and hazards associated with AI systems, including issues related to hate speech, cybersecurity, and misuse of AI capabilities." +highlight: true +section_id: 6.2 +image: model-evaluation-risks-harms-taxonomies.png +details: "Taxonomies provide a way of categorising, defining and understanding risks and hazards created through the use and deployment of AI systems. The following taxonomies focus on the types of interactions and uses that create a risk of harm as well as the negative effects that they lead to." 
+--- diff --git a/app/content/english/foundation-model-resources/model-evaluation-risks-harms-taxonomies/model-evaluation-risks-harms-taxonomies.png b/app/content/english/foundation-model-resources/model-evaluation-risks-harms-taxonomies/model-evaluation-risks-harms-taxonomies.png new file mode 100644 index 0000000..320a36f Binary files /dev/null and b/app/content/english/foundation-model-resources/model-evaluation-risks-harms-taxonomies/model-evaluation-risks-harms-taxonomies.png differ diff --git a/app/content/english/foundation-model-resources/model-evaluation-risks-harms/index.md b/app/content/english/foundation-model-resources/model-evaluation-risks-harms/index.md new file mode 100644 index 0000000..af1431f --- /dev/null +++ b/app/content/english/foundation-model-resources/model-evaluation-risks-harms/index.md @@ -0,0 +1,11 @@ +--- +title: "Risks & Harms Evaluation Resources for Foundation Models" +short_name: "Risks & Harms Evaluation" +type: "fm-resource-category" +date: "2024-03-17" +description: "Explore evaluations of risks and harms in foundation models. Understand the importance of assessing risks and harms, and discover methodologies and taxonomies for evaluating potential risks, mitigations, and decision-making in model development and deployment." +highlight: true +section_id: 6.3 +image: model-evaluation-risks-harms.png +details: "The following tools for evaluating risk serve multiple purposes: to identify if there are issues which need mitigation, to track the success of any such mitigations, to document for other users of the model what risks are still present, and to help make decisions related to model access and release." 
+--- diff --git a/app/content/english/foundation-model-resources/model-evaluation-risks-harms/model-evaluation-risks-harms.png b/app/content/english/foundation-model-resources/model-evaluation-risks-harms/model-evaluation-risks-harms.png new file mode 100644 index 0000000..9d4ef1a Binary files /dev/null and b/app/content/english/foundation-model-resources/model-evaluation-risks-harms/model-evaluation-risks-harms.png differ diff --git a/app/content/english/foundation-model-resources/model-training-educational-resources/index.md b/app/content/english/foundation-model-resources/model-training-educational-resources/index.md new file mode 100644 index 0000000..952e525 --- /dev/null +++ b/app/content/english/foundation-model-resources/model-training-educational-resources/index.md @@ -0,0 +1,11 @@ +--- +title: "Educational Resources for Foundation Model Training" +short_name: "Additional Educational Resources" +type: "fm-resource-category" +date: "2024-03-17" +description: "Discover educational resources for foundation model training. Access materials to learn about the considerations and best practices for successfully training or fine-tuning foundation models." +highlight: true +section_id: 4.4 +image: model-training-educational-resources.png +details: "Training models at any scale can be quite daunting to newer practitioners. The following educational resources may be useful in learning about the considerations required for successfully and effectively training or fine-tuning foundation models." 
+--- diff --git a/app/content/english/foundation-model-resources/model-training-educational-resources/model-training-educational-resources.png b/app/content/english/foundation-model-resources/model-training-educational-resources/model-training-educational-resources.png new file mode 100644 index 0000000..fe691a4 Binary files /dev/null and b/app/content/english/foundation-model-resources/model-training-educational-resources/model-training-educational-resources.png differ diff --git a/app/content/english/foundation-model-resources/model-training-efficiency-resource-allocation/index.md b/app/content/english/foundation-model-resources/model-training-efficiency-resource-allocation/index.md new file mode 100644 index 0000000..185b4e8 --- /dev/null +++ b/app/content/english/foundation-model-resources/model-training-efficiency-resource-allocation/index.md @@ -0,0 +1,11 @@ +--- +title: "Resources for Model Training: Efficiency & Resource Allocation" +short_name: "Efficiency & Resource Allocation" +type: "fm-resource-category" +date: "2024-03-17" +description: "Learn about efficiency and resource allocation in foundation model training. Explore resources and best practices for optimizing resource usage, reducing training costs, and minimizing the environmental impact of model training." +highlight: true +section_id: 4.3 +image: model-training-efficiency-resource-allocation.png +details: "Knowledge of training best practices can reduce the cost of training a desired model significantly. Here, we link to readings and resources on effectively using a given resource budget for model training, including canonical papers on fitting scaling laws." 
+--- diff --git a/app/content/english/foundation-model-resources/model-training-efficiency-resource-allocation/model-training-efficiency-resource-allocation.png b/app/content/english/foundation-model-resources/model-training-efficiency-resource-allocation/model-training-efficiency-resource-allocation.png new file mode 100644 index 0000000..2f1ea1d Binary files /dev/null and b/app/content/english/foundation-model-resources/model-training-efficiency-resource-allocation/model-training-efficiency-resource-allocation.png differ diff --git a/app/content/english/foundation-model-resources/model-training-finetuning-repositories/index.md b/app/content/english/foundation-model-resources/model-training-finetuning-repositories/index.md new file mode 100644 index 0000000..a464501 --- /dev/null +++ b/app/content/english/foundation-model-resources/model-training-finetuning-repositories/index.md @@ -0,0 +1,11 @@ +--- +title: "Finetuning Repositories for Foundation Model Training" +short_name: "Finetuning Repositories" +type: "fm-resource-category" +date: "2024-03-17" +description: "Explore finetuning repositories for foundation model development. Access resources for adapting foundation models after pretraining to ensure greater ecosystem compatibility and reduce barriers to experimentation." +highlight: true +section_id: 4.2 +image: model-training-finetuning-repositories.png +details: "Fine-tuning, or other types of adaptation performed on foundation models after pretraining, are an equally important and complex step in model development. Fine-tuned models are more frequently deployed than base models. Here, we also link to some useful and widely-used resources for adapting foundation models or otherwise fine-tuning them." 
+--- diff --git a/app/content/english/foundation-model-resources/model-training-finetuning-repositories/model-training-finetuning-repositories.png b/app/content/english/foundation-model-resources/model-training-finetuning-repositories/model-training-finetuning-repositories.png new file mode 100644 index 0000000..13e1d6f Binary files /dev/null and b/app/content/english/foundation-model-resources/model-training-finetuning-repositories/model-training-finetuning-repositories.png differ diff --git a/app/content/english/foundation-model-resources/model-training-pretraining-repositories/index.md b/app/content/english/foundation-model-resources/model-training-pretraining-repositories/index.md new file mode 100644 index 0000000..6fcdc24 --- /dev/null +++ b/app/content/english/foundation-model-resources/model-training-pretraining-repositories/index.md @@ -0,0 +1,11 @@ +--- +title: "Pretraining Repositories for Foundation Model Training" +short_name: "Pretraining Repositories" +type: "fm-resource-category" +date: "2024-03-17" +description: "Discover pretraining repositories for foundation model development. Explore existing open-source codebases tailored for pretraining to optimize computational resources and enhance accessibility for new practitioners." +highlight: true +section_id: 4.1 +image: model-training-pretraining-repositories.png +details: "Practitioners should consider using already-optimized codebases, especially in the pre-training phase, to ensure effective use of computational resources, capital, power, and effort. Existing open-source codebases targeted at foundation model pretraining can make pretraining significantly more accessible to new practitioners and help accumulate techniques for efficiency in model training." 
+--- diff --git a/app/content/english/foundation-model-resources/model-training-pretraining-repositories/model-training-pretraining-repositories.png b/app/content/english/foundation-model-resources/model-training-pretraining-repositories/model-training-pretraining-repositories.png new file mode 100644 index 0000000..cc6f4b5 Binary files /dev/null and b/app/content/english/foundation-model-resources/model-training-pretraining-repositories/model-training-pretraining-repositories.png differ diff --git a/app/content/english/foundation-model-resources/pretraining-data-sources/index.md b/app/content/english/foundation-model-resources/pretraining-data-sources/index.md new file mode 100644 index 0000000..1e4ed33 --- /dev/null +++ b/app/content/english/foundation-model-resources/pretraining-data-sources/index.md @@ -0,0 +1,11 @@ +--- +title: "Pretraining Data Sources" +short_name: "Pretraining Data Sources" +type: "fm-resource-category" +section_id: 1.1 +date: "2024-03-17" +description: "Understand the importance of pretraining data for foundation models. Careful data selection impacts model behavior and capabilities." +highlight: true +image: pretraining-data-sources.png +details: "Practitioners should consider using already-optimized codebases, especially in the pre-training phase, to ensure effective use of computational resources, capital, power, and effort. Existing open-source codebases targeted at foundation model pretraining can be significantly more accessible to new practitioners and help contribute to efficient training strategies." 
+--- diff --git a/app/content/english/foundation-model-resources/pretraining-data-sources/pretraining-data-sources.png b/app/content/english/foundation-model-resources/pretraining-data-sources/pretraining-data-sources.png new file mode 100644 index 0000000..2526365 Binary files /dev/null and b/app/content/english/foundation-model-resources/pretraining-data-sources/pretraining-data-sources.png differ diff --git a/app/content/english/foundation-model-resources/reproducibility/index.md b/app/content/english/foundation-model-resources/reproducibility/index.md new file mode 100644 index 0000000..8fd094c --- /dev/null +++ b/app/content/english/foundation-model-resources/reproducibility/index.md @@ -0,0 +1,11 @@ +--- +title: "Reproducibility Resources" +short_name: "Reproducibility" +type: "fm-resource-category" +date: "2024-03-17" +section_id: 7.2 +description: "Understand the importance of reproducibility in foundation model development. Learn about the challenges of replicating evaluation results and discover best practices for ensuring scientific reproducibility through clear code, documentation, and setup." +highlight: true +image: reproducibility.png +details: "Model releases accompanied by performance claims that are not reproducible, or by code that is unavailable, incomplete, or difficult to run, cost the scientific community time and effort. The following resources are valuable to help others replicate and verify the claims." 
+--- diff --git a/app/content/english/foundation-model-resources/reproducibility/reproducibility.png b/app/content/english/foundation-model-resources/reproducibility/reproducibility.png new file mode 100644 index 0000000..8cc5bba Binary files /dev/null and b/app/content/english/foundation-model-resources/reproducibility/reproducibility.png differ diff --git a/app/content/english/foundation-model-resources/usage-monitoring/index.md b/app/content/english/foundation-model-resources/usage-monitoring/index.md new file mode 100644 index 0000000..0d4adcd --- /dev/null +++ b/app/content/english/foundation-model-resources/usage-monitoring/index.md @@ -0,0 +1,11 @@ +--- +title: "Usage Monitoring for Foundation Models" +short_name: "Usage Monitoring" +type: "fm-resource-category" +date: "2024-03-17" +section_id: 7.4 +description: "Discover resources for usage monitoring in foundation model development. Explore techniques for monitoring model usage, including watermarking, access control, and reporting adverse events. Learn about challenges and considerations in implementing usage monitoring strategies." +highlight: true +image: usage-monitoring.png +details: "Monitoring foundation model usage is an evolving area of research. The following techniques, such as watermarking model outputs or gating access to the model, are some of the ways to do so." 
+--- diff --git a/app/content/english/foundation-model-resources/usage-monitoring/usage-monitoring.png b/app/content/english/foundation-model-resources/usage-monitoring/usage-monitoring.png new file mode 100644 index 0000000..50ef42c Binary files /dev/null and b/app/content/english/foundation-model-resources/usage-monitoring/usage-monitoring.png differ diff --git a/app/content/english/index.json b/app/content/english/index.json new file mode 100644 index 0000000..8c825be --- /dev/null +++ b/app/content/english/index.json @@ -0,0 +1,15 @@ +{ + "@context": "http://schema.org", + "@type": "Organization", + "url": "https://fmcheatsheet.org", + "logo": "https://fmcheatsheet.org/images/foundation-models.png", + "name": "Foundation Model Cheatsheet", + "brand": "Foundation Model Cheatsheet", + "keywords": "Foundation Models, AI Models, AI Research, ML Tools, AI Resources", + "slogan": "Resources for Foundation Model Development", + "mainEntityOfPage": { + "@type": "WebSite", + "@id": "https://fmcheatsheet.org" + }, + "description": "Foundation Model Cheatsheet is a community-driven resource for AI researchers and developers to discover, compare, and use the latest foundation models and tools." 
+} diff --git a/app/data/social.json b/app/data/social.json new file mode 100644 index 0000000..2c890d6 --- /dev/null +++ b/app/data/social.json @@ -0,0 +1,9 @@ +{ + "main": [ + { + "name": "github", + "icon": "fab fa-github", + "link": "https://github.com/allenai/fm-cheatsheet" + } + ] +} diff --git a/app/data/theme.json b/app/data/theme.json new file mode 100644 index 0000000..0cce1fc --- /dev/null +++ b/app/data/theme.json @@ -0,0 +1,47 @@ +{ + "colors": { + "default": { + "theme_color": { + "primary": "#01061e", + "body": "#f1f5f9", + "border": "#03071e", + "theme_light": "#FFF", + "theme_dark": "", + "header_top": "", + "header_bottom": "#000" + }, + "text_color": { + "default": "#444444", + "dark": "#040404", + "light": "#717171" + } + }, + "darkmode": { + "theme_color": { + "primary": "#38ffc7", + "body": "#00000e", + "border": "#628996", + "theme_light": "#0f172a", + "theme_dark": "#000" + }, + "text_color": { + "default": "#FFF", + "dark": "#FFF", + "light": "#B4AFB6" + } + } + }, +"fonts": { + "font_family": { + "primary": "Inter:wght@400;600", + "primary_type": "sans-serif", + "secondary": "Work Sans:wght@500;700", + "secondary_type": "sans-serif" + }, + "font_size": { + "base": "16", + "scale": "1.250" + } +} + +} diff --git a/app/go.mod b/app/go.mod new file mode 100644 index 0000000..630e123 --- /dev/null +++ b/app/go.mod @@ -0,0 +1,23 @@ +module hugoplate.netlify.app + +go 1.20 + +require ( + github.com/gethugothemes/hugo-modules/accordion v0.0.0-20240228105219-38c9f9c6e062 // indirect + github.com/gethugothemes/hugo-modules/components/cookie-consent v0.0.0-20240228105219-38c9f9c6e062 // indirect + github.com/gethugothemes/hugo-modules/components/custom-script v0.0.0-20240228105219-38c9f9c6e062 // indirect + github.com/gethugothemes/hugo-modules/components/render-link v0.0.0-20240228105219-38c9f9c6e062 // indirect + github.com/gethugothemes/hugo-modules/components/social-share v0.0.0-20240228105219-38c9f9c6e062 // indirect + 
github.com/gethugothemes/hugo-modules/gallery-slider v0.0.0-20240228105219-38c9f9c6e062 // indirect + github.com/gethugothemes/hugo-modules/gzip-caching v0.0.0-20240228105219-38c9f9c6e062 // indirect + github.com/gethugothemes/hugo-modules/icons/font-awesome v0.0.0-20240228105219-38c9f9c6e062 // indirect + github.com/gethugothemes/hugo-modules/images v0.0.0-20240228105219-38c9f9c6e062 // indirect + github.com/gethugothemes/hugo-modules/modal v0.0.0-20240228105219-38c9f9c6e062 // indirect + github.com/gethugothemes/hugo-modules/search v0.0.0-20240228105219-38c9f9c6e062 // indirect + github.com/gethugothemes/hugo-modules/seo-tools/basic-seo v0.0.0-20240228105219-38c9f9c6e062 // indirect + github.com/gethugothemes/hugo-modules/seo-tools/site-verifications v0.0.0-20240228105219-38c9f9c6e062 // indirect + github.com/gethugothemes/hugo-modules/shortcodes/button v0.0.0-20240228105219-38c9f9c6e062 // indirect + github.com/gethugothemes/hugo-modules/shortcodes/notice v0.0.0-20240228105219-38c9f9c6e062 // indirect + github.com/gethugothemes/hugo-modules/tab v0.0.0-20240228105219-38c9f9c6e062 // indirect + github.com/gethugothemes/hugo-modules/videos v0.0.0-20240228105219-38c9f9c6e062 // indirect +) diff --git a/app/hugo.toml b/app/hugo.toml new file mode 100755 index 0000000..2bc882a --- /dev/null +++ b/app/hugo.toml @@ -0,0 +1,161 @@ +######################## default configuration #################### +# The base URL of your site (required). This will be prepended to all relative URLs. +# baseURL = "https://fmcheatsheet.org/" +relativeURLs = true +# Title of your website (required). 
+title = "Foundation Model Cheatsheet" +# Your theme name +theme = "hugoplate" +# Default time zone for time stamps; use any valid tz database name: https://en.wikipedia.org/wiki/List_of_tz_database_time_zones#List +timeZone = "America/New_York" +# post pagination +paginate = 100 # see https://gohugo.io/extras/pagination/ +# post excerpt +summaryLength = 10 # see https://gohugo.io/content-management/excerpts/ +# google analytics +googleAnalytics = "G-MEASUREMENT_ID" # see https://gohugo.io/templates/internal/#configure-google-analytics +# disqus short name +#disqusShortname = "" # we use disqus to show comments in blog posts . To install disqus please follow this tutorial https://portfolio.peter-baumgartner.net/2017/09/10/how-to-install-disqus-on-hugo/ +# disable language +disableLanguages = [ +] # example: ["fr"] for disable french language. see https://gohugo.io/content-management/multilingual/ +hasCJKLanguage = false # If hasCJKLanguage true, auto-detect Chinese/Japanese/Korean Languages in the content. 
see: https://gohugo.io/getting-started/configuration/#hascjklanguage + +########################## Permalinks ############################ +[permalinks.page] +"pages" = "/:slug/" + + +############################# Modules ############################ +[module] +[[module.mounts]] +source = "assets" +target = "assets" + +[[module.mounts]] +source = "hugo_stats.json" +target = "assets/watching/hugo_stats.json" + +############################# Build ############################## +[build] +noJSConfigInAssets = false +useResourceCacheWhen = 'fallback' +disableFastRender = true +[build.buildStats] +enable = true +[[build.cachebusters]] +source = 'assets/watching/hugo_stats\.json' +target = 'style\.css' +[[build.cachebusters]] +source = '(postcss|tailwind)\.config\.js' +target = 'css' +[[build.cachebusters]] +source = 'assets/.*\.(js|ts|jsx|tsx)' +target = 'js' +[[build.cachebusters]] +source = 'assets/.*\.(css|scss|sass)' +target = 'css' +[[build.cachebusters]] +source = 'assets/.*\.(.*)$' +target = '$1' + + +############################# Outputs ############################ +[outputs] +home = ["HTML", "RSS", "SearchIndex"] + +############################# Imaging ############################ +[imaging] +# See https://github.com/disintegration/imaging +# Default JPEG or WebP quality setting. Default is 75. 
+quality = 90 +resampleFilter = "Lanczos" + +############################ Caches ############################## +[caches] +[caches.images] +dir = ":resourceDir/_gen" +maxAge = "720h" + +[caches.assets] +dir = ":resourceDir/_gen" +maxAge = "720h" + + +############################ Markup ############################## +[markup] +[markup.goldmark.renderer] +unsafe = true + +[markup.highlight] +#style = 'monokai' # see https://xyproto.github.io/splash/docs/all.html +#style = 'catppuccin-macchiato' # see https://xyproto.github.io/splash/docs/all.html +style = 'friendly' # see https://xyproto.github.io/splash/docs/all.html + +[markup.tableOfContents] +startLevel = 2 +endLevel = 5 +ordered = true + +########################### Media types ########################### +[mediaTypes] +[mediaTypes."application/manifest+json"] +suffixes = ["webmanifest"] + +########################### Output Format ########################## +[outputFormats] +[outputFormats.WebAppManifest] +mediaType = "application/manifest+json" +rel = "manifest" + +[outputFormats.SearchIndex] +mediaType = "application/json" +baseName = "searchindex" +isPlainText = true +notAlternative = true + + +############################# Plugins ############################## + +# CSS Plugins +[[params.plugins.css]] +link = "plugins/swiper/swiper-bundle.css" +[[params.plugins.css]] +link = "plugins/glightbox/glightbox.css" +[[params.plugins.css]] +link = "plugins/font-awesome/v6/brands.css" +[[params.plugins.css]] +link = "plugins/font-awesome/v6/solid.css" +[[params.plugins.css]] +link = "plugins/font-awesome/v6/icons.css" + +# JS Plugins +[[params.plugins.js]] +link = "js/search.js" +[[params.plugins.js]] +link = "plugins/swiper/swiper-bundle.js" +[[params.plugins.js]] +link = "plugins/glightbox/glightbox.js" +[[params.plugins.js]] +link = "js/gallery-slider.js" +[[params.plugins.js]] +link = "js/accordion.js" +[[params.plugins.js]] +link = "js/tab.js" +[[params.plugins.js]] +link = "js/modal.js" +[[params.plugins.js]] 
+link = "plugins/cookie.js" +[[params.plugins.js]] +link = "plugins/youtube-lite.js" + +[sitemap] +changeFreq = '' +filename = 'sitemap.xml' +priority = -1 + +disableKinds = ['homepage'] + +staticDir = 'static' + +[Taxonomies] diff --git a/app/i18n/en.yaml b/app/i18n/en.yaml new file mode 100755 index 0000000..ae07aa4 --- /dev/null +++ b/app/i18n/en.yaml @@ -0,0 +1,32 @@ +- id: home + translation: Home + +- id: read_more + translation: Read More + +- id: send + translation: Send + +- id: related_posts + translation: Related Posts + +- id: categories + translation: Categories + +- id: tags + translation: Tags + +- id: toc + translation: Table of Contents + +- id: share + translation: Share + +- id: search_input_placeholder + translation: Search Post ... + +- id: no_results_for + translation: No results for + +- id: empty_search_results_placeholder + translation: Type something to search.. diff --git a/app/netlify.toml b/app/netlify.toml new file mode 100755 index 0000000..f541a08 --- /dev/null +++ b/app/netlify.toml @@ -0,0 +1,12 @@ +[build] +publish = "public" +command = "yarn project-setup; yarn build" + +[build.environment] +HUGO_VERSION = "0.115.4" +GO_VERSION = "1.20.5" + +[headers] + for = "/*" + [headers.values] + Access-Control-Allow-Origin = "*" diff --git a/app/nginx.conf b/app/nginx.conf new file mode 100644 index 0000000..202f88f --- /dev/null +++ b/app/nginx.conf @@ -0,0 +1,48 @@ +server { + listen 8000; + listen [::]:8000; + server_name localhost; + + + # Add CORS headers + add_header 'Access-Control-Allow-Origin' '*'; + add_header 'Access-Control-Allow-Methods' 'GET, POST, OPTIONS'; + add_header 'Access-Control-Allow-Headers' 'Origin, Content-Type, Accept, Authorization'; + + location / { + root /usr/share/nginx/html; + index index.html index.htm; + } + + #error_page 404 /404.html; + + # redirect server error pages to the static page /50x.html + # + error_page 500 502 503 504 /50x.html; + location = /50x.html { + root /usr/share/nginx/html; + } + + # 
proxy the PHP scripts to Apache listening on 127.0.0.1:80 + # + #location ~ \.php$ { + # proxy_pass http://127.0.0.1; + #} + + # pass the PHP scripts to FastCGI server listening on 127.0.0.1:9000 + # + #location ~ \.php$ { + # root html; + # fastcgi_pass 127.0.0.1:9000; + # fastcgi_index index.php; + # fastcgi_param SCRIPT_FILENAME /scripts$fastcgi_script_name; + # include fastcgi_params; + #} + + # deny access to .htaccess files, if Apache's document root + # concurs with nginx's one + # + #location ~ /\.ht { + # deny all; + #} +} diff --git a/app/package.json b/app/package.json new file mode 100644 index 0000000..0f581c8 --- /dev/null +++ b/app/package.json @@ -0,0 +1,34 @@ +{ + "name": "hugoplate", + "description": "hugo tailwindcss boilerplate - fmcheatsheet", + "version": "1.7.5", + "license": "MIT", + "author": "zeon.studio", + "scripts": { + "dev": "hugo server --disableFastRender --noHTTPCache", + "build": "hugo --gc --minify --templateMetrics --templateMetricsHints --forceSyncStatic -e production --minify", + "build-j": "hugo --gc --minify --templateMetrics --templateMetricsHints --forceSyncStatic", + "test": "hugo server --disableFastRender --navigateToChanged --templateMetrics --templateMetricsHints --watch --forceSyncStatic -e production --minify", + "dev:example": "cd exampleSite; hugo server", + "build:example": "hugo --gc --minify --templateMetrics --templateMetricsHints --forceSyncStatic", + "test:example": "hugo server --disableFastRender --navigateToChanged --templateMetrics --templateMetricsHints --watch --forceSyncStatic -e production --minify", + "update-modules": "node ./scripts/clearModules.js && hugo mod clean --all && hugo mod get -u ./... && hugo mod tidy", + "remove-darkmode": "node ./scripts/removeDarkmode.js && yarn format", + "project-setup": "node ./scripts/projectSetup.js", + "theme-setup": "node ./scripts/themeSetup.js", + "format": "prettier -w ." 
+ }, + "devDependencies": { + "@fullhuman/postcss-purgecss": "^5.0.0", + "@tailwindcss/forms": "^0.5.6", + "@tailwindcss/typography": "^0.5.9", + "autoprefixer": "^10.4.15", + "postcss": "^8.4.35", + "postcss-cli": "^10.1.0", + "prettier": "^3.0.2", + "prettier-plugin-go-template": "0.0.15", + "prettier-plugin-tailwindcss": "^0.5.3", + "tailwind-bootstrap-grid": "^5.0.1", + "tailwindcss": "^3.3.3" + } +} diff --git a/app/postcss.config.js b/app/postcss.config.js new file mode 100644 index 0000000..466b595 --- /dev/null +++ b/app/postcss.config.js @@ -0,0 +1,45 @@ +const purgecss = { + content: ["./hugo_stats.json"], + defaultExtractor: (content) => { + const elements = JSON.parse(content).htmlElements; + return [ + ...(elements.tags || []), + ...(elements.classes || []), + ...(elements.ids || []), + ]; + }, + safelist: [ + /^swiper-/, + /^lb-/, + /^gl/, + /^go/, + /^gc/, + /^gs/, + /^gi/, + /^gz/, + /^gprev/, + /^gnext/, + /^desc/, + /^zoom/, + /^search/, + /^:is/, + /dark/, + /show/, + /dragging/, + /fullscreen/, + /loaded/, + /visible/, + /current/, + /active/, + /after/, + ], +}; + +module.exports = { + plugins: { + tailwindcss: {}, + "@fullhuman/postcss-purgecss": + process.env.HUGO_ENVIRONMENT === "production" ? purgecss : false, + autoprefixer: process.env.HUGO_ENVIRONMENT === "production" ? {} : false, + }, +}; diff --git a/app/readme.md b/app/readme.md new file mode 100755 index 0000000..f299a12 --- /dev/null +++ b/app/readme.md @@ -0,0 +1,214 @@ +Hugoplate is a free starter template built with Hugo, and TailwindCSS, providing everything you need to jumpstart your Hugo project and save valuable time.
+ +Made with ♥ by Zeon Studio
+If you find this project useful, please give it a ⭐ to show your support.
+ ++ + + + + + + + + + + + + + + +
+ +## 🎁 What's Included + +We have included almost everything you need to start your Hugo project. Let's see what's included in this template: + +### 📌 Key Features + +- 👥 Multi-Authors +- 🎯 Similar Posts Suggestion +- 🔍 Search Functionality +- 🌑 Dark Mode +- 🏷️ Tags & Categories +- 🔗 Netlify setting pre-configured +- 📞 Support contact form +- 📱 Fully responsive +- 📝 Write and update content in Markdown +- 💬 Disqus Comment +- 🔳 Syntax Highlighting + +### 📄 15+ Pre-designed Pages + +- 🏠 Homepage +- 👤 About +- 📞 Contact +- 👥 Authors +- 👤 Author Single +- 📝 Blog +- 📝 Blog Single +- 🚫 Custom 404 +- 💡 Elements +- 📄 Privacy Policy +- 🏷️ Tags +- 🏷️ Tag Single +- 🗂️ Categories +- 🗂️ Category Single +- 🔍 Search + +### 📦 Tech Stack + +- [Hugo](https://gohugo.io/) +- [Tailwind CSS](https://tailwindcss.com/) +- [PostCSS](https://postcss.org/) +- [PurgeCSS](https://purgecss.com/) +- [AutoPrefixer](https://autoprefixer.github.io/) +- [Hugo Modules](https://gohugo.io/hugo-modules/) +- [Markdown](https://markdownguide.org/) +- [Prettier](https://prettier.io/) +- [Jshint](https://jshint.com/) +- [Netlify](https://www.netlify.com/) +- [Vercel](https://vercel.com/) +- [Github Actions](https://github.com/features/actions) +- [Gitlab Ci](https://docs.gitlab.com/ee/ci/) +- [AWS Amplify](https://aws.amazon.com/amplify/) + +--- + +## 🚀 Getting Started + +First you need to [clone](https://github.com/zeon-studio/hugoplate) or [download](https://github.com/zeon-studio/hugoplate/archive/refs/heads/main.zip) the template repository, and then let's get started with the following process: + +### ⚙️ Prerequisites + +To start using this template, you need to have some prerequisites installed on your machine. + +- [Hugo Extended v0.115+](https://gohugo.io/installation/) +- [Node v18+](https://nodejs.org/en/download/) +- [Go v1.20+](https://go.dev/doc/install) + +### 👉 Project Setup + +[//]: # (We build this custom script to make your project setup easier. 
It will create a new Hugo theme folder, and clone the Hugoplate theme into it. Then move the exampleSite folder into the root directory. So that you can start your Hugo server without going into the exampleSite folder. Use the following command to setup your project.) + +[//]: # () +[//]: # (```bash) + +[//]: # (npm run project-setup) + +[//]: # (```) + +### 👉 Install Dependencies + +Install all the dependencies using the following command. + +```bash +npm install +``` + +### 👉 Development Command + +Start the development server using the following command. + +```bash +npm run dev +``` + +### 🎬 Still Confused? Watch a Quick Video + +https://github.com/zeon-studio/hugoplate/assets/58769763/c260c0ae-91be-42ce-b8db-aa7f11f777bd + +--- + +## 📝 Customization + +This template has been designed with a lot of customization options in mind. You can customize almost anything you want, including: + +### 👉 Site Config + +You can change the site title, base URL, language, theme, plugins, and more from the `hugo.toml` file. + +### 👉 Site Params + +You can customize all the parameters from the `config/_default/params.toml` file. This includes the logo, favicon, search, SEO metadata, and more. + +### 👉 Colors and Fonts + +You can change the colors and fonts from the `data/theme.json` file. This includes the primary color, secondary color, font family, and font size. + +### 👉 Social Links + +You can change the social links from the `data/social.json` file. Add your social links here, and they will automatically be displayed on the site. + +--- + +## 🛠 Advanced Usage + +We have added some custom scripts to make your life easier. You can use these scripts to help you with your development. + +### 👉 Update Modules + +We have added a lot of modules to this template. You can update all the modules using the following command. 
+ +```bash +npm run update-modules +``` + +### 👉 Remove Dark Mode + +If you want to remove dark mode from your project, then you have to do it manually from everywhere. So we built a custom script to do it for you. You can use the following command to remove dark mode from your project. + +```bash +npm run remove-darkmode +``` + +--- + +## 🚀 Build And Deploy + +After you finish your development, you can build or deploy your project almost everywhere. Let's see the process: + +### 👉 Build Command + +To build your project locally, you can use the following command. It will purge all the unused CSS and minify all the files. + +```bash +npm run build +``` + +### 👉 Deploy Site + +We have provided 5 different deploy platform configurations with this template, so you can deploy easily. + +- [Netlify](https://www.netlify.com/) +- [Vercel](https://vercel.com/) +- [Github Actions](https://github.com/features/actions) +- [Gitlab Ci](https://docs.gitlab.com/ee/ci/) +- [AWS Amplify](https://aws.amazon.com/amplify/) + +If you want to host on another platform, you can build your project and you will get a `public` folder that you can copy to your hosting platform. + +> **Note:** You must change the `baseURL` in the `hugo.toml` file. Otherwise, your site will not work properly. + +--- + +## 🔒 Guide to Staying Compliant + +### 🐞 Reporting Issues + +We use GitHub Issues as the official bug tracker for this Template. Please search [existing issues](https://github.com/zeon-studio/hugoplate/issues). It’s possible someone has already reported the same problem. +If your problem or idea has not been addressed yet, feel free to [open a new issue](https://github.com/zeon-studio/hugoplate/issues). + +### 📝 License + +Copyright (c) 2023 - Present, Designed & Developed by [Zeon Studio](https://zeon.studio/) + +**Code License:** Released under the [MIT](https://github.com/zeon-studio/hugoplate/blob/main/LICENSE) license. 
+ +**Image license:** The images are only for demonstration purposes. They have their license, we don't have permission to share those images. diff --git a/app/requirements.txt b/app/requirements.txt deleted file mode 100644 index 73a380b..0000000 --- a/app/requirements.txt +++ /dev/null @@ -1,10 +0,0 @@ -numpy==1.24.2 -pandas==1.3.5 -matplotlib -streamlit==1.27.0 -streamlit-aggrid==0.3.4 -jsonlines -awesome-streamlit==20200728.1 -beautifulsoup4==4.12.2 -lxml==5.0.0 -watchdog==4.0.0 diff --git a/app/resources/resources.jsonl b/app/resources/resources.jsonl deleted file mode 100644 index 1fc250c..0000000 --- a/app/resources/resources.jsonl +++ /dev/null @@ -1,265 +0,0 @@ -{"Name": "BigBench", "Description": "A collaborative benchmark of 100s of tasks, probing LLMs on a wide array of unique capabilities.", "Modalities": ["Text"], "Categories": ["Model Evaluation: Capabilities"], "Date": "6-2022", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2206.04615", "Website Link": "", "GitHub Link": "https://github.com/google/BIG-bench", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "BigBench Hard", "Description": "A challenging subset of 23 BigBench tasks where at time of release models did not outperform annotator performance.", "Modalities": ["Text"], "Categories": ["Model Evaluation: Capabilities"], "Date": "10-2022", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2210.09261", "Website Link": "", "GitHub Link": "https://github.com/suzgunmirac/BIG-Bench-Hard", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "BigCode Evaluation Harness", "Description": "A framework for the evaluation of code generation models, compiling many evaluation sets.", "Modalities": ["Text"], "Categories": ["Model Evaluation: Capabilities"], "Date": "Frequently Updated", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/bigcode-project/bigcode-evaluation-harness/tree/main", 
"HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "CLIP benchmark", "Description": "Image classification, retrieval and captioning", "Modalities": ["Text", "Vision"], "Categories": ["Model Evaluation: Capabilities"], "Date": "4-2022", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/LAION-AI/CLIP_benchmark", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "DataComp eval suite", "Description": "38 image classification and retrieval downstream tasks", "Modalities": ["Text", "Vision"], "Categories": ["Model Evaluation: Capabilities"], "Date": "4-2023", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2304.14108", "Website Link": "https://www.datacomp.ai/", "GitHub Link": "https://github.com/mlfoundations/datacomp#evaluation", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "HEIM", "Description": "A large suite of text-to-image evaluations. Useful for thorough capability analysis of these model types.", "Modalities": ["Text", "Vision"], "Categories": ["Model Evaluation: Capabilities"], "Date": "11-2023", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://crfm.stanford.edu/heim/v1.1.0/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "HELM classic", "Description": "A large suite of benchmarks and metric types, to holistically evaluate many model qualities aside from performance on general tasks. 
Useful for a thorough comparison against other well known models.", "Modalities": ["Text"], "Categories": ["Model Evaluation: Capabilities"], "Date": "11-2022", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/2211.09110", "Website Link": "https://crfm.stanford.edu/helm/latest/", "GitHub Link": "https://github.com/stanford-crfm/helm", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Hugging Face Leaderboards Collection", "Description": "A collection of unique leaderboards on Hugging Face for ranking models across modalities and tasks.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Evaluation: Capabilities"], "Date": "Frequently Updated", "Primary Link": "Hugging Face object", "Paper Link": "", "Website Link": "https://huggingface.co/blog?tag=leaderboard", "GitHub Link": "", "HuggingFace Link": "https://huggingface.co/collections/clefourrier/leaderboards-and-benchmarks-64f99d2e11e92ca5568a7cce", "Added By": "Original Authors"} -{"Name": "HumanEvalPack", "Description": "HumanEvalPack is a code evaluation benchmark across 6 languages and 3 tasks, extending OpenAI's HumanEval.", "Modalities": ["Text"], "Categories": ["Model Evaluation: Capabilities"], "Date": "8-2023", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2308.07124", "Website Link": "", "GitHub Link": "https://github.com/bigcode-project/octopack", "HuggingFace Link": "https://huggingface.co/datasets/bigcode/humanevalpack", "Added By": "Original Authors"} -{"Name": "Lighteval", "Description": "Small, highly configurable LLM evaluation library, for fast experimentation and iteration.", "Modalities": ["Text"], "Categories": ["Model Evaluation: Capabilities"], "Date": "Frequently Updated", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/huggingface/lighteval", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "LM Evaluation Harness", "Description": "Orchestration framework for 
standardizing LM prompted evaluation, supporting hundreds of subtasks.", "Modalities": ["Text"], "Categories": ["Model Evaluation: Capabilities", "Reproducibility"], "Date": "Frequently Updated", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/EleutherAI/lm-evaluation-harness", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "LMSys Chatbot Arena", "Description": "A leaderboard of models based on Elo ratings where humans or models select their preferred response between two anonymous models. Chatbot Arena, MT-Bench, and 5-shot MMLU are used as benchmarks. This resource provides a general purpose, and GPT-4 biased perspective into model capabilities.", "Modalities": ["Text"], "Categories": ["Model Evaluation: Capabilities"], "Date": "Frequently Updated", "Primary Link": "Hugging Face object", "Paper Link": "https://arxiv.org/abs/2306.05685", "Website Link": "", "GitHub Link": "https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md", "HuggingFace Link": "https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard", "Added By": "Original Authors"} -{"Name": "MMBench", "Description": "A joint vision and text benchmark evaluating dozens of capabilities, using curated datasets and ChatGPT in the loop.", "Modalities": ["Text", "Vision"], "Categories": ["Model Evaluation: Capabilities"], "Date": "7-2023", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2307.06281", "Website Link": "https://opencompass.org.cn/mmbench", "GitHub Link": "https://github.com/open-compass/MMBench", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "MME", "Description": "An evaluation benchmark for multimodal large language models with 14 manually curated subtasks, to avoid data leakage.", "Modalities": ["Text", "Vision"], "Categories": ["Model Evaluation: Capabilities"], "Date": "6-2023", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2306.13394", "Website Link": "", 
"GitHub Link": "https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "MTEB", "Description": "The Massive Text Embedding Benchmark measures the quality of embeddings across 58 datasets and 112 languages for tasks related to retrieval, classification, clustering or semantic similarity.", "Modalities": ["Text"], "Categories": ["Model Evaluation: Capabilities"], "Date": "10-2022", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2210.07316", "Website Link": "", "GitHub Link": "https://github.com/embeddings-benchmark/mteb", "HuggingFace Link": "https://huggingface.co/spaces/mteb/leaderboard", "Added By": "Original Authors"} -{"Name": "OpenASR Leaderboard", "Description": "An automatic leaderboard ranking and evaluating speech recognition models on common benchmarks.", "Modalities": ["Speech"], "Categories": ["Model Evaluation: Capabilities"], "Date": "Frequently Updated", "Primary Link": "Hugging Face object", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/huggingface/open_asr_leaderboard", "HuggingFace Link": "https://huggingface.co/spaces/hf-audio/open_asr_leaderboard", "Added By": "Original Authors"} -{"Name": "OpenFlamingo eval suite", "Description": "VQA, captioning, classification", "Modalities": ["Text", "Vision"], "Categories": ["Model Evaluation: Capabilities"], "Date": "8-2023", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2308.01390", "Website Link": "", "GitHub Link": "https://github.com/mlfoundations/open_flamingo/tree/main/open_flamingo/eval", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Open LLM Leaderboard", "Description": "A popular leaderboard on Hugging Face for ranking open LLMs on their knowledge, reasoning and math capabilities.", "Modalities": ["Text"], "Categories": ["Model Evaluation: Capabilities"], "Date": "Frequently Updated", "Primary Link": "Hugging Face object", "Paper Link": 
"", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "https://huggingface.co/open-llm-leaderboard", "Added By": "Original Authors"} -{"Name": "SWE Bench", "Description": "SWE-bench is a benchmark for evaluating large language models on real world software issues collected from GitHub. Given a codebase and an issue, a language model is tasked with generating a patch that resolves the described problem.", "Modalities": ["Text"], "Categories": ["Model Evaluation: Capabilities"], "Date": "10-2023", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/2310.06770", "Website Link": "https://www.swebench.com/", "GitHub Link": "https://github.com/princeton-nlp/SWE-bench", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "The Edinburgh International Accents of English Corpus", "Description": "Benchmark dataset of diverse English varieties for evaluating automatic speech recognition models (typically trained and tested only on US English)", "Modalities": ["Speech"], "Categories": ["Model Evaluation: Capabilities"], "Date": "3-2023", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/2303.18110", "Website Link": "https://groups.inf.ed.ac.uk/edacc/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "HELM lite", "Description": "A lightweight subset of capability-centric benchmarks within HELM with comparisons to many prominent open and closed models.", "Modalities": ["Text"], "Categories": ["Model Evaluation: Capabilities"], "Date": "12-2023", "Primary Link": "Webpage", "Paper Link": "https://crfm.stanford.edu/2023/12/19/helm-lite.html", "Website Link": "https://crfm.stanford.edu/helm/lite/latest/#/", "GitHub Link": "https://github.com/stanford-crfm/helm", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "MMMU", "Description": "A benchmark to evaluate joint text and vision models on 11k examples spanning 30 college-level subject domains.", "Modalities": ["Text", "Vision"], 
"Categories": ["Model Evaluation: Capabilities"], "Date": "11-2023", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/2311.16502", "Website Link": "https://mmmu-benchmark.github.io/", "GitHub Link": "https://github.com/MMMU-Benchmark/MMMU", "HuggingFace Link": "https://huggingface.co/datasets/MMMU/MMMU", "Added By": "Original Authors"} -{"Name": "Anaconda", "Description": "An environment and dependency management tool.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Reproducibility"], "Date": "Frequently Updated", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://www.anaconda.com/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Colab Notebooks", "Description": "A tool to execute and share reproducible code snippets.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Reproducibility"], "Date": "Frequently Updated", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://colab.research.google.com/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Docker", "Description": "An environment and dependency management tool.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Reproducibility"], "Date": "Frequently Updated", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://docker-curriculum.com/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Jupyter Notebooks", "Description": "A tool to execute and share reproducible code snippets.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Reproducibility"], "Date": "Frequently Updated", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://jupyter.org/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Semver", "Description": "A widely used protcol for versioning to software, to ensure easy reproducibility.", "Modalities": ["Text", "Speech", 
"Vision"], "Categories": ["Reproducibility"], "Date": "", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://semver.org/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Reforms", "Description": "Reporting Standards for ML-based Science.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Reproducibility"], "Date": "8-2023", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/2308.07832", "Website Link": "https://reforms.cs.princeton.edu/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "A Retrospective Datasheet for BookCorpus", "Description": "A third party datasheet for BookCorpus", "Modalities": ["Text"], "Categories": ["Data Auditing"], "Date": "5-2021", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/abs/2105.05241", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Data Provenance Initiative", "Description": "A large scale audit of 2000+ popular datasets in AI.", "Modalities": ["Text"], "Categories": ["Data Auditing"], "Date": "Frequently Updated", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/abs/2310.16787", "Website Link": "https://www.dataprovenance.org/", "GitHub Link": "https://github.com/Data-Provenance-Initiative/Data-Provenance-Collection", "HuggingFace Link": "https://huggingface.co/DataProvenanceInitiative", "Added By": "Original Authors"} -{"Name": "Datasheet for the Pile", "Description": "A datasheet for the Pile", "Modalities": ["Text"], "Categories": ["Data Auditing"], "Date": "1-2022", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/abs/2201.07311", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "HaveIBeenTrained", "Description": "A combination search tool / opt out tool for LAION", "Modalities": ["Text", "Vision"], "Categories": ["Data Auditing", "Data Governance"], "Date": 
"Frequently Updated", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://haveibeentrained.com/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Into the LAIONs Den", "Description": "Auditing hateful content in text-to-vision datasets.", "Modalities": ["Text", "Vision"], "Categories": ["Data Auditing"], "Date": "9-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/abs/2311.03449", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Multimodal datasets: misogyny, pornography, and malignant stereotypes", "Description": "Auditing vision datasets for sensitive content.", "Modalities": ["Text", "Vision"], "Categories": ["Data Auditing"], "Date": "10-2021", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/abs/2110.01963", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "On Hate Scaling Laws For Data-Swamps", "Description": "Auditing text and vision datasets for systemic biases and hate.", "Modalities": ["Text", "Vision"], "Categories": ["Data Auditing"], "Date": "6-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/abs/2306.13141", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Quality at a Glance", "Description": "An audit of allegedly multilingual parallel text corpora.", "Modalities": ["Text"], "Categories": ["Data Auditing"], "Date": "3-2021", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/abs/2103.12028", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Training Data Transparency Blog", "Description": "A blog on transparency for training data in AI.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Data Auditing"], "Date": "12-2023", "Primary Link": "Webpage", "Paper Link": "", "Website Link": 
"https://huggingface.co/blog/yjernite/data-transparency", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Data Selection via Importance Resampling (DSIR)", "Description": "A tool for selecting data with a similar distribution to a target dataset", "Modalities": ["Text"], "Categories": ["Data Cleaning, Filtering, & Mixing"], "Date": "12-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/abs/2302.03169", "Website Link": "", "GitHub Link": "https://github.com/p-lambda/dsir", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "DataComp filtering", "Description": "Various quality filters", "Modalities": ["Text", "Vision"], "Categories": ["Data Cleaning, Filtering, & Mixing"], "Date": "4-2023", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/2304.14108", "Website Link": "https://www.datacomp.ai/", "GitHub Link": "https://github.com/mlfoundations/datacomp/tree/main#baselines", "HuggingFace Link": "https://huggingface.co/datasets/mlfoundations/datacomp_1b", "Added By": "Original Authors"} -{"Name": "DataComp pre-filtering", "Description": "NSFW detection, dedup with eval datasets", "Modalities": ["Text", "Vision"], "Categories": ["Data Cleaning, Filtering, & Mixing"], "Date": "4-2023", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/2304.14108", "Website Link": "https://www.datacomp.ai/", "GitHub Link": "https://github.com/mlfoundations/dataset2metadata", "HuggingFace Link": "https://huggingface.co/datasets/mlfoundations/datacomp_1b", "Added By": "Original Authors"} -{"Name": "Detoxify", "Description": "A python library designed to identify toxic language in comments. 
Functions in seven languages: English, Italian, French, Russian, Portuguese, Spanish, Turking.", "Modalities": ["Text"], "Categories": ["Data Cleaning, Filtering, & Mixing"], "Date": "", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/unitaryai/detoxify", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Dolma's Toolkit", "Description": "A Python framework for defining Taggers that identify non-language text, language ID, PII, toxic text, and \"quality\" text. Includes reimplementation of heuristics used by Gopher and C4 for non-natural language.", "Modalities": ["Text"], "Categories": ["Data Cleaning, Filtering, & Mixing"], "Date": "8-2023", "Primary Link": "Hugging Face object", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/allenai/dolma", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "DoReMi", "Description": "A github repository for Domain Reweighting with Minimax Optimization", "Modalities": ["Text"], "Categories": ["Data Cleaning, Filtering, & Mixing"], "Date": "5-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/abs/2305.10429", "Website Link": "", "GitHub Link": "https://github.com/sangmichaelxie/doremi", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "fastText language classifier", "Description": "A tool for classifying the language of text", "Modalities": ["Text"], "Categories": ["Data Cleaning, Filtering, & Mixing"], "Date": "5-2023", "Primary Link": "Hugging Face object", "Paper Link": "", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "https://huggingface.co/facebook/fasttext-language-identification", "Added By": "Original Authors"} -{"Name": "FUN-LangID", "Description": "Frequently Used N-grams Language ID model, a character 4-gram model trained to recognize up to 1633 languages.", "Modalities": ["Text"], "Categories": ["Data Cleaning, Filtering, & Mixing"], "Date": "9-2023", "Primary Link": 
"GitHub", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/google-research/url-nlp/tree/main/fun-langid", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "GlotLID", "Description": "A model for identifying languages, with support for more than 1600 languages.", "Modalities": ["Text"], "Categories": ["Data Cleaning, Filtering, & Mixing"], "Date": "", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2310.16248", "Website Link": "", "GitHub Link": "https://github.com/cisnlp/GlotLID", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Langdetect", "Description": "A tool to predict the language of text, used to filter out/in data from the desired languages", "Modalities": ["Text"], "Categories": ["Data Cleaning, Filtering, & Mixing"], "Date": "5-2021", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "https://pypi.org/project/langdetect/", "GitHub Link": "https://github.com/Mimino666/langdetect", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Lilac", "Description": "A python package for better understanding your data. 
Includes keyword and semantic search, as well as detection for PII, duplicates, and language.", "Modalities": ["Text"], "Categories": ["Data Cleaning, Filtering, & Mixing"], "Date": "9-2023", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "https://www.lilacml.com/", "GitHub Link": "https://github.com/lilacai/lilac", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Online Data Mixing", "Description": "A github repository for efficient online data mixing", "Modalities": ["Text"], "Categories": ["Data Cleaning, Filtering, & Mixing"], "Date": "12-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/abs/2312.02406", "Website Link": "", "GitHub Link": "https://github.com/alon-albalak/online-data-mixing", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "OpenLID", "Description": "A model (and data used to train the model) for identifying 200+ languages.", "Modalities": ["Text"], "Categories": ["Data Cleaning, Filtering, & Mixing"], "Date": "", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2305.13820", "Website Link": "", "GitHub Link": "https://github.com/laurieburchell/open-lid-dataset", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Roots data cleaning pipeline", "Description": "A pipeline for processing and improving quality of crowdsourced datasets", "Modalities": ["Text"], "Categories": ["Data Cleaning, Filtering, & Mixing"], "Date": "10-2022", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/bigscience-workshop/data-preparation/tree/main/preprocessing/training/01a_catalogue_cleaning_and_filtering", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "SpeechBrain’s Spoken language ID model", "Description": "Pre-trained spoken language identification model trained on VoxLingua107, dataset of audio sourced from YouTube for 107 languages", "Modalities": ["Speech"], "Categories": ["Data Cleaning, Filtering, & 
Mixing"], "Date": "6-2021", "Primary Link": "Hugging Face object", "Paper Link": "https://arxiv.org/abs/2106.04624", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "https://huggingface.co/speechbrain/lang-id-voxlingua107-ecapa", "Added By": "Original Authors"} -{"Name": "The Pile processing scripts", "Description": "A series of scripts to replicate the Pile dataset. Includes filtering and cleaning for: language, profanity, deduplication, and test set decontamination.", "Modalities": ["Text"], "Categories": ["Data Cleaning, Filtering, & Mixing"], "Date": "12-2020", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/EleutherAI/the-pile/tree/master/processing_scripts", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "BigBench Canaries", "Description": "BigBench's \"Training on the Test Set\" Task provies guidance on using canaries to check if an evaluation set was trained on.", "Modalities": ["Text"], "Categories": ["Data Decontamination"], "Date": "10-2021", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/google/BIG-bench/blob/main/bigbench/benchmark_tasks/training_on_test_set/README.md#training-on-the-test-set", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Carper AI Decontamination Tool", "Description": "A repository, heavily based by the BigCode repository, to decontaminate evaluation sets from a text training set.", "Modalities": ["Text"], "Categories": ["Data Decontamination"], "Date": "1-2023", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/CarperAI/decontamination/tree/main", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Data Portraits", "Description": "A tool to test for membership inference of popular datasets, like The Pile or The Stack, i.e. 
whether a model has seen certain data.", "Modalities": ["Text"], "Categories": ["Data Decontamination"], "Date": "3-2023", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/2303.03919", "Website Link": "https://dataportraits.org/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Detect Pretrain Data (Min-K Prob)", "Description": "Detect Pretrain Data (Min-K Prob)", "Modalities": ["Text"], "Categories": ["Data Decontamination"], "Date": "11-2023", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/2310.16789", "Website Link": "https://swj0419.github.io/detect-pretrain.github.io/", "GitHub Link": "https://github.com/swj0419/detect-pretrain-code", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Interpreting Canary Exposure", "Description": "An explanation on how to interpret canary exposure, including by relating it to membership inference attacks, and differential privacy.", "Modalities": ["Text"], "Categories": ["Data Decontamination"], "Date": "5-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/abs/2306.00133", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Proving Test Set Contamination in Black Box Language Models", "Description": "A paper that provides methods for provable guarantees of test set contamination in language models without access to pretraining data or model weights.", "Modalities": ["Text"], "Categories": ["Data Decontamination"], "Date": "10-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/abs/2310.17623", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Apricot", "Description": "apricot implements submodular optimization for the purpose of summarizing massive data sets into minimally redundant subsets that are still representative of the original data. 
These subsets are useful for both visualizing the modalities in the data (such as in the two data sets below) and for training accurate machine learning models with just a fraction of the examples and compute.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Data Deduplication"], "Date": "7-1905", "Primary Link": "GitHub", "Paper Link": "https://dl.acm.org/doi/abs/10.5555/3455716.3455877", "Website Link": "", "GitHub Link": "https://github.com/jmschrei/apricot", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Datacomp image dedup", "Description": "Data to deduplicate vision datasets for the Datacomp challenge.", "Modalities": ["Vision"], "Categories": ["Data Deduplication"], "Date": "8-2023", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "https://www.datacomp.ai/", "GitHub Link": "https://github.com/mlfoundations/dataset2metadata", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Dolma Dedupe Tool", "Description": "Dolma's text deduplication tool for pretraining data", "Modalities": ["Text"], "Categories": ["Data Deduplication"], "Date": "10-2023", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/allenai/dolma", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Google Text Deduplication", "Description": "A repository to deduplicate language model datasets. They release the ExactSubstr deduplication implementation (written in Rust) along with scripts to perform ExactSubstr deduplication and inspect the results (written in Python). 
They also release the document clusters resulting from running NearDup deduplication on C4, RealNews, LM1B, and Wiki-4B-en.", "Modalities": ["Text"], "Categories": ["Data Deduplication"], "Date": "7-2021", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2107.06499", "Website Link": "", "GitHub Link": "https://github.com/google-research/deduplicate-text-datasets", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "RedPajama-Data", "Description": "Tools for: exact deduplication with bloom filter, fuzzy deduplication with LSH, calculating quality scores", "Modalities": ["Text"], "Categories": ["Data Deduplication"], "Date": "10-2023", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/togethercomputer/RedPajama-Data", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Pile", "Description": "A set of tools for deduplication with MinHashLSH", "Modalities": ["Text"], "Categories": ["Data Deduplication"], "Date": "5-2023", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2101.00027", "Website Link": "", "GitHub Link": "https://huggingface.co/datasets/EleutherAI/pile-standard-pythia-preshuffled", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Data Cards Playbook", "Description": "A tool to create a Data Card that thoroughly documents a new dataset.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Data Documentation"], "Date": "6-2022", "Primary Link": "Webpage", "Paper Link": "https://dl.acm.org/doi/fullHtml/10.1145/3531146.3533231", "Website Link": "https://sites.research.google/datacardsplaybook/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Data Nutrition Labels", "Description": "A generic but thorough form of dataset documentation.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Data Documentation"], "Date": "2020", "Primary Link": "Webpage", "Paper Link": 
"https://arxiv.org/abs/1805.03677", "Website Link": "https://datanutrition.org/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Data Provenance Attribution Card", "Description": "A repository to select datasets and generate a summary. It can also generate a bibtex to attribute all developers of the datasets.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Data Documentation"], "Date": "10-2023", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2310.16787", "Website Link": "https://www.dataprovenance.org/", "GitHub Link": "https://github.com/Data-Provenance-Initiative/Data-Provenance-Collection", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Data Statements", "Description": "A data statement to thoroughly document a new dataset.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Data Documentation"], "Date": "2018", "Primary Link": "Paper", "Paper Link": "https://aclanthology.org/Q18-1041/", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Datasheets for Datasets", "Description": "A datasheet to thoroughly document a new dataset.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Data Documentation"], "Date": "3-2018", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/abs/1803.09010", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Datasheets for Digital Cultural Heritage Datasets", "Description": "A datasheet specifically designed for digital cultural heritage datasets and their considerations.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Data Documentation"], "Date": "2023", "Primary Link": "Paper", "Paper Link": "https://cris.unibo.it/handle/11585/947893", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Data Governance in the Age of Large-Scale Data-Driven 
Language Technology", "Description": "A paper detailing the data governance decisions undertaken during BigScience's BLOOM project. ", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Data Governance"], "Date": "5-2022", "Primary Link": "Paper", "Paper Link": "https://dl.acm.org/doi/abs/10.1145/3531146.3534637", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "https://huggingface.co/spaces/bigscience-data/roots-search", "Added By": "Original Authors"} -{"Name": "Reclaiming the Digital Commons: A Public Data Trust for Training Data", "Description": "A paper that argues for the creation of a public data trust for collective input into the creation of AI systems and analyzes the feasibility of such a data trust.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Data Governance"], "Date": "3-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/abs/2303.09001", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "BigCode Governance Card", "Description": "A report outlining governance questions, approaches, and tooling in the BigCode project, with a focus on Data governance", "Modalities": ["Text"], "Categories": ["Data Governance"], "Date": "11-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/abs/2312.03872", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "AmIinTheStack", "Description": "A tool to let software developers check whether their code was included in TheStack dataset and opt out of inclusion in future versions", "Modalities": ["Text"], "Categories": ["Data Governance"], "Date": "9-2022", "Primary Link": "Hugging Face object", "Paper Link": "", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "https://huggingface.co/spaces/bigcode/in-the-stack", "Added By": "Original Authors"} -{"Name": "StarPII: BigCode Pseudonymization Model", "Description": "A model trained on a new dataset of 
PII in code used for pseudonymization of a dataset prior to training", "Modalities": ["Text"], "Categories": ["Data Governance"], "Date": "4-2023", "Primary Link": "Hugging Face object", "Paper Link": "", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "https://huggingface.co/bigcode/starpii", "Added By": "Original Authors"} -{"Name": "French DPA Resource sheets on AI and GDPR", "Description": "A set of resource sheets focused on GDPR compliance covering legal basis for data collection, sharing, and best practices for handling personal data", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Data Governance"], "Date": "10-2023", "Primary Link": "Webpage", "Paper Link": "https://www.cnil.fr/en/ai-how-sheets", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "AI2 C4 Search Tool", "Description": "A search tool that lets users to execute full-text queries to search Google's C4 Dataset.", "Modalities": ["Text"], "Categories": ["Data Search, Analysis, & Exploration"], "Date": "7-1905", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://c4-search.apps.allenai.org/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Data Finder", "Description": "A tool to help build search over academic datasets given a natural language description of the idea.", "Modalities": ["Text"], "Categories": ["Data Search, Analysis, & Exploration"], "Date": "5-2023", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2305.16636", "Website Link": "", "GitHub Link": "https://github.com/viswavi/datafinder", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Data Provenance Explorer", "Description": "An explorer tool for selecting, filtering, and visualizing popular finetuning, instruction, and alignment training datasets from Hugging Face, based on their metadata such as source, license, languages, tasks, topics, among other properties.", 
"Modalities": ["Text"], "Categories": ["Data Search, Analysis, & Exploration"], "Date": "Frequently Updated", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/2310.16787", "Website Link": "https://www.dataprovenance.org/", "GitHub Link": "https://github.com/Data-Provenance-Initiative/Data-Provenance-Collection", "HuggingFace Link": "https://huggingface.co/DataProvenanceInitiative", "Added By": "Original Authors"} -{"Name": "GAIA Search Tool", "Description": "A search tool over C4, the Pile, ROOTS, and the text captions of LAION, developed with Pyserini (https://github.com/castorini/pyserini).", "Modalities": ["Text"], "Categories": ["Data Search, Analysis, & Exploration"], "Date": "6-2023", "Primary Link": "Hugging Face object", "Paper Link": "https://arxiv.org/abs/2306.01481", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "https://huggingface.co/spaces/spacerini/gaia", "Added By": "Original Authors"} -{"Name": "Hugging Face Data Measurements Tool", "Description": "A tool to analyze, measure, and compare properties of text finetuning data, including their distributional statistics, lengths, and vocabularies.", "Modalities": ["Text"], "Categories": ["Data Search, Analysis, & Exploration"], "Date": "7-1905", "Primary Link": "Hugging Face object", "Paper Link": "", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "https://huggingface.co/spaces/huggingface/data-measurements-tool", "Added By": "Original Authors"} -{"Name": "Know your data", "Description": "A tool for exploring over 70 vision datasets", "Modalities": ["Vision"], "Categories": ["Data Search, Analysis, & Exploration"], "Date": "5-2021", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://knowyourdata-tfds.withgoogle.com/", "GitHub Link": "https://github.com/PAIR-code/knowyourdata", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "LAION search", "Description": "Nearest neighbor search based on CLIP embeddings", "Modalities": ["Text", 
"Vision"], "Categories": ["Data Search, Analysis, & Exploration"], "Date": "3-2022", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://rom1504.github.io/clip-retrieval/", "GitHub Link": "https://github.com/rom1504/clip-retrieval", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "NVIDIA Speech Data Explorer", "Description": "Tool for exploring speech data", "Modalities": ["Speech"], "Categories": ["Data Search, Analysis, & Exploration"], "Date": "Frequently Updated", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/tools/speech_data_explorer.html", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "ROOTS Search Tool", "Description": "A tool, based on a BM25 index, to search over text for each language or group of languages included in the ROOTS pretraining dataset.", "Modalities": ["Text"], "Categories": ["Data Search, Analysis, & Exploration"], "Date": "7-1905", "Primary Link": "Hugging Face object", "Paper Link": "", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "https://huggingface.co/spaces/bigscience-data/roots-search", "Added By": "Original Authors"} -{"Name": "What's In My Big Data?", "Description": "A platform for analyzing large text datasets at scale", "Modalities": ["Text"], "Categories": ["Data Search, Analysis, & Exploration"], "Date": "10-2023", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/2310.20707", "Website Link": "https://wimbd.apps.allenai.org/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "WIMBD", "Description": "A dataset analysis tool to count, search, and compare attributes across several massive pretraining corpora at scale, including C4, The Pile, and RedPajama.", "Modalities": ["Text"], "Categories": ["Data Search, Analysis, & Exploration"], "Date": "11-2023", "Primary Link": "Webpage", "Paper Link": 
"https://arxiv.org/abs/2310.20707", "Website Link": "https://wimbd.apps.allenai.org/", "GitHub Link": "https://github.com/allenai/wimbd", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Everything about Distributed Training and Efficient Finetuning", "Description": "A rundown and crash course in distributed training for deep learning, with an eye toward LLM finetuning and current useful tools and resources. Provides a good overview of the various (distributed) training strategies for efficient and scalable training.", "Modalities": ["Text"], "Categories": ["Model Training: Educational Resources"], "Date": "10-2023", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://sumanthrh.com/post/distributed-and-efficient-finetuning/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Machine Learning Engineering Online Book", "Description": "An \"online textbook\" and resource collection on ML engineering at scale, ranging from debugging distributed systems, parallelism strategies, effective use of large HPC clusters, and chronicles of past large-scale training runs with lessons learned.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Training: Educational Resources"], "Date": "Frequently Updated", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/stas00/ml-engineering", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "nanoGPT", "Description": "A minimal, stripped-down training codebase for teaching purposes and easily-hackable yet performant small-scale training.", "Modalities": ["Text"], "Categories": ["Model Training: Educational Resources"], "Date": "12-2022", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/karpathy/nanoGPT", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "The EleutherAI Model Training Cookbook", "Description": "A set of 
resources on how to train large scale AI systems", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Training: Educational Resources"], "Date": "12-2023", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/EleutherAI/cookbook", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Transformer Inference Arithmetic", "Description": "A blog post on the inference costs of transformer-based LMs. Useful for providing more insight into deep learning accelerators and inference-relevant decisions to make when training a model.", "Modalities": ["Text"], "Categories": ["Model Training: Educational Resources"], "Date": "3-2022", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://kipp.ly/transformer-inference-arithmetic/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Transformer Math 101", "Description": "An introductory blog post on training costs of LLMs, going over useful formulas and considerations from a high to low level", "Modalities": ["Text"], "Categories": ["Model Training: Educational Resources"], "Date": "4-2023", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://blog.eleuther.ai/transformer-math/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Azure Emissions Impact Dashboard", "Description": "Monitoring the environmental impact of training machine learning models on Azure", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Environmental Impact"], "Date": "10-2021", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://www.microsoft.com/en-us/sustainability/emissions-impact-dashboard", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Carbontracker", "Description": "carbontracker is a tool for tracking and predicting the energy consumption and carbon footprint of training deep learning models as described in 
Anthony et al. (2020).", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Environmental Impact"], "Date": "7-2020", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2007.03051", "Website Link": "", "GitHub Link": "https://github.com/lfwa/carbontracker", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "CodeCarbon", "Description": "Estimate and track carbon emissions from your computer, quantify and analyze their impact.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Environmental Impact"], "Date": "11-2020", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "https://mlco2.github.io/codecarbon/", "GitHub Link": "https://github.com/mlco2/codecarbon", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Estimating the Carbon Footprint of BLOOM, a 176B Parameter Language Model", "Description": "A comprehensive account of the broader environmental impact of the BLOOM language model.", "Modalities": ["Text"], "Categories": ["Environmental Impact"], "Date": "6-2023", "Primary Link": "Paper", "Paper Link": "https://jmlr.org/papers/v24/23-0069.html", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Experiment Impact Tracker", "Description": "The experiment-impact-tracker is meant to be a simple drop-in method to track energy usage, carbon emissions, and compute utilization of your system. Currently, on Linux systems with Intel chips (that support the RAPL or powergadget interfaces) and NVIDIA GPUs, we record: power draw from CPU and GPU, hardware information, python package versions, estimated carbon emissions information, etc. 
In California we even support realtime carbon emission information by querying caiso.com!", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Environmental Impact"], "Date": "1-2020", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2002.05651", "Website Link": "", "GitHub Link": "https://github.com/Breakend/experiment-impact-tracker", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Google Cloud Carbon Footprint Measurement", "Description": "Tracking the emissions of using Google's cloud compute resources", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Environmental Impact"], "Date": "10-2021", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://cloud.google.com/carbon-footprint?hl=en", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Making AI Less \"Thirsty\"", "Description": "Uncovering and Addressing the Secret Water Footprint of AI Models, and estimating water usage for training and deploying LLMs.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Environmental Impact"], "Date": "4-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/abs/2304.03271", "Website Link": "", "GitHub Link": "https://github.com/Ren-Research/Making-AI-Less-Thirsty", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "ML CO2 Impact", "Description": "A tool for estimating carbon impacts of ML training", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Environmental Impact"], "Date": "10-2019", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/1910.09700", "Website Link": "https://mlco2.github.io/impact/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Scaling Laws for Neural Language Models", "Description": "Provide scaling laws to determine the optimal allocation of a fixed compute budget.", "Modalities": ["Text"], "Categories": ["Environmental Impact", "Model Training: 
Efficiency & Resource Allocation"], "Date": "1-2020", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/abs/2001.08361", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Training Compute-Optimal Large Language Models", "Description": "Provides details on the optimal model size and number of tokens for training a transformer-based language model in a given computational budget.", "Modalities": ["Text"], "Categories": ["Environmental Impact"], "Date": "3-2022", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/abs/2203.15556", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "AI4Bhārat Indic NLP", "Description": "A repository of Indian language text and speech resources, including datasets.", "Modalities": ["Text", "Speech"], "Categories": ["Finetuning Data Catalogs"], "Date": "Frequently Updated", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "https://ai4bharat.iitm.ac.in/", "GitHub Link": "https://github.com/AI4Bharat", "HuggingFace Link": "https://huggingface.co/ai4bharat", "Added By": "Original Authors"} -{"Name": "Arabic NLP Data Catalogue", "Description": "A catalogue of hundreds of Arabic text and speech finetuning datasets, regularly updated.", "Modalities": ["Text", "Speech"], "Categories": ["Finetuning Data Catalogs"], "Date": "Frequently Updated", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://arbml.github.io/masader/", "GitHub Link": "https://github.com/ARBML", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "CHiME-5", "Description": "Speaker Diarization dataset comprising over 50 hours of conversational speech recordings collected from twenty real dinner parties that have taken place in real homes", "Modalities": ["Speech"], "Categories": ["Finetuning Data Catalogs"], "Date": "7-1905", "Primary Link": "Webpage", "Paper Link": 
"https://licensing.sheffield.ac.uk/product/chime5/print", "Website Link": "https://licensing.sheffield.ac.uk/product/chime5", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Data Provenance Collection", "Description": "A repository and explorer tool for selecting popular finetuning, instruction, and alignment training datasets from Hugging Face, based on data provenance and characteristics criteria.", "Modalities": ["Text"], "Categories": ["Finetuning Data Catalogs"], "Date": "Frequently Updated", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/2310.16787", "Website Link": "https://www.dataprovenance.org/", "GitHub Link": "https://github.com/Data-Provenance-Initiative/Data-Provenance-Collection", "HuggingFace Link": "https://huggingface.co/DataProvenanceInitiative", "Added By": "Original Authors"} -{"Name": "ImageNet", "Description": "An image classification dataset with 1.3M samples and 1000 classes", "Modalities": ["Vision"], "Categories": ["Finetuning Data Catalogs"], "Date": "6-2009", "Primary Link": "Webpage", "Paper Link": "https://ieeexplore.ieee.org/abstract/document/5206848", "Website Link": "https://www.image-net.org/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Indonesian NLP Data Catalogue", "Description": "A respository of hundreds of Indonesian language datasets.", "Modalities": ["Text", "Speech"], "Categories": ["Finetuning Data Catalogs"], "Date": "Frequently Updated", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://indonlp.github.io/nusa-catalogue/", "GitHub Link": "https://github.com/IndoNLP/nusa-crowd", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Lanfrica", "Description": "An online catalogue that provides links to African language resources (papers and datasets) in both texts and speech", "Modalities": ["Text", "Speech"], "Categories": ["Finetuning Data Catalogs"], "Date": "Frequently Updated", "Primary 
Link": "Webpage", "Paper Link": "", "Website Link": "https://lanfrica.com/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Masakhane NLP", "Description": "A repository of African language text and speech resources, including datasets.", "Modalities": ["Text", "Speech"], "Categories": ["Finetuning Data Catalogs"], "Date": "Frequently Updated", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "https://www.masakhane.io/", "GitHub Link": "https://github.com/masakhane-io", "HuggingFace Link": "https://huggingface.co/masakhane", "Added By": "Original Authors"} -{"Name": "MS COCO", "Description": "Object detection, segmentation, captioning and retrieval dataset", "Modalities": ["Text", "Vision"], "Categories": ["Finetuning Data Catalogs"], "Date": "5-2014", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/1405.0312", "Website Link": "https://cocodataset.org/#home", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "OpenSLR", "Description": "A collection of user-contributed datasets for various speech processing tasks", "Modalities": ["Speech"], "Categories": ["Finetuning Data Catalogs"], "Date": "Frequently Updated", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://www.openslr.org/resources.php", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "SEACrowd", "Description": "A repository of hundreds of South East Asian language datasets.", "Modalities": ["Text", "Speech"], "Categories": ["Finetuning Data Catalogs"], "Date": "Frequently Updated", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://seacrowd.github.io/seacrowd-catalogue/", "GitHub Link": "https://github.com/SEACrowd", "HuggingFace Link": "https://huggingface.co/NusaCrowd", "Added By": "Original Authors"} -{"Name": "VoxCeleb", "Description": "Speaker Identification dataset comprising of YouTube interviews from thousands of celebrities", 
"Modalities": ["Speech"], "Categories": ["Finetuning Data Catalogs"], "Date": "6-2017", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/1706.08612", "Website Link": "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "VoxLingua107", "Description": "Spoken language identification dataset created using audio extracted from YouTube videos retrieved using language-specific search phrases", "Modalities": ["Speech"], "Categories": ["Finetuning Data Catalogs"], "Date": "11-2020", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/2011.12998", "Website Link": "https://bark.phon.ioc.ee/voxlingua107/", "GitHub Link": "", "HuggingFace Link": "https://huggingface.co/speechbrain/lang-id-voxlingua107-ecapa", "Added By": "Original Authors"} -{"Name": "Zenodo AfricaNLP Community", "Description": "An online catalogue that provides African language resources (data and models) in both texts and speech", "Modalities": ["Text", "Speech"], "Categories": ["Finetuning Data Catalogs"], "Date": "Frequently Updated", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://zenodo.org/communities/africanlp", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Axolotl", "Description": "A repository for chat- or instruction-tuning language models, including through full fine-tuning, LoRA, QLoRA, and GPTQ.", "Modalities": ["Text"], "Categories": ["Model Training: Finetuning Repositories"], "Date": "Frequently Updated", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/OpenAccess-AI-Collective/axolotl", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "BLIP-2", "Description": "Fine-tuned LLMs on multimodal data using a projection layer", "Modalities": ["Text", "Vision"], "Categories": ["Model Training: Finetuning Repositories"], "Date": "1-2023", "Primary Link": "GitHub", 
"Paper Link": "https://arxiv.org/abs/2301.12597", "Website Link": "", "GitHub Link": "https://github.com/salesforce/LAVIS/tree/main/projects/blip2", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "LLaMA-Adapter", "Description": "Fine-tuned LLMs on multimodal data using adapters", "Modalities": ["Text", "Vision"], "Categories": ["Model Training: Finetuning Repositories"], "Date": "3-2023", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2304.15010", "Website Link": "", "GitHub Link": "https://github.com/OpenGVLab/LLaMA-Adapter", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "LLaMA-Factory", "Description": "A framework for efficiently fine-tuning LLMs using cutting-edge algorithms with a user-friendly web UI", "Modalities": ["Text"], "Categories": ["Model Training: Finetuning Repositories"], "Date": "Frequently Updated", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2403.13372", "Website Link": "", "GitHub Link": "https://github.com/hiyouga/LLaMA-Factory", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "LLaVA", "Description": "Fine-tuned LLMs on multimodal data using a projection layer", "Modalities": ["Text", "Vision"], "Categories": ["Model Training: Finetuning Repositories"], "Date": "4-2023", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/2310.03744", "Website Link": "https://llava-vl.github.io/", "GitHub Link": "https://github.com/haotian-liu/LLaVA", "HuggingFace Link": "https://huggingface.co/spaces/badayvedat/LLaVA", "Added By": "Original Authors"} -{"Name": "MiniGPT4", "Description": "Fine-tuned LLMs on multimodal data using a projection layer", "Modalities": ["Text", "Vision"], "Categories": ["Model Training: Finetuning Repositories"], "Date": "4-2023", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/2304.10592", "Website Link": "https://minigpt-4.github.io/", "GitHub Link": "https://github.com/Vision-CAIR/MiniGPT-4", "HuggingFace Link": 
"https://huggingface.co/spaces/Vision-CAIR/minigpt4", "Added By": "Original Authors"} -{"Name": "OpenFlamingo", "Description": "Open source implementation of Flamingo", "Modalities": ["Text", "Vision"], "Categories": ["Model Training: Finetuning Repositories"], "Date": "3-2023", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2308.01390", "Website Link": "https://laion.ai/blog/open-flamingo-v2/", "GitHub Link": "https://github.com/mlfoundations/open_flamingo", "HuggingFace Link": "https://huggingface.co/openflamingo", "Added By": "Original Authors"} -{"Name": "Otter", "Description": "Multimodal models with Flamingo architecture", "Modalities": ["Text", "Vision"], "Categories": ["Model Training: Finetuning Repositories"], "Date": "4-2023", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2311.04219", "Website Link": "", "GitHub Link": "https://github.com/Luodian/Otter", "HuggingFace Link": "https://huggingface.co/spaces/Otter-AI/OtterHD-Demo", "Added By": "Original Authors"} -{"Name": "peft", "Description": "A library for doing parameter efficient finetuning", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Training: Finetuning Repositories"], "Date": "Frequently Updated", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/huggingface/peft", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "trl", "Description": "A library for doing RLHF on LLMs.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Training: Finetuning Repositories"], "Date": "Frequently Updated", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/huggingface/trl", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "trlX", "Description": "A library for doing RLHF on LLMs.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Training: Finetuning Repositories"], "Date": "Frequently Updated", "Primary 
Link": "GitHub", "Paper Link": "https://aclanthology.org/2023.emnlp-main.530/", "Website Link": "https://trlx.readthedocs.io/en/latest/", "GitHub Link": "https://github.com/CarperAI/trlx", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Levanter", "Description": "Levanter is a framework for training large language models (LLMs) and other foundation models that strives for legibility, scalability, and reproducibility:", "Modalities": ["Text"], "Categories": ["Model Training: Finetuning Repositories", "Model Training: Pretraining Repositories"], "Date": "6-2023", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "https://crfm.stanford.edu/2023/06/16/levanter-1_0-release.html", "GitHub Link": "https://github.com/stanford-crfm/levanter", "HuggingFace Link": "https://huggingface.co/stanford-crfm", "Added By": "Original Authors"} -{"Name": "AI Licensing Can’t Balance “Open” with “Responsible”", "Description": "A blog post by an IP lawyer arguing against responsible use licensing", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["License Selection"], "Date": "7-2023", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://katedowninglaw.com/2023/07/13/ai-licensing-cant-balance-open-with-responsible/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "AI Pubs Open RAIL-M License", "Description": "Template for a responsible AI model license where the model is intended for research use. 
Use restrictions relate to discrimination, transparency, and violating the law", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["License Selection"], "Date": "3-2023", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://www.licenses.ai/ai-pubs-open-railm-vz1", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "AI2 ImpACT-LR License", "Description": "License for low risk AI artifacts (data and models) that allows for distribution of the artifact and its derivatives. Use restrictions include weapons development and military surveillance", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["License Selection"], "Date": "7-2023", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://allenai.org/licenses/impact-lr", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "AI2 ImpACT-MR License", "Description": "License for medium risk AI artifacts (data and models) that does not allows for distribution of the artifact but does allow for distribution of its derivatives. 
Use restrictions include weapons development and military surveillance", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["License Selection"], "Date": "7-2023", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://allenai.org/licenses/impact-mr", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Apache 2.0 License", "Description": "The most common open-source license for model weights", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["License Selection"], "Date": "1-2004", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://www.apache.org/licenses/LICENSE-2.0", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Behavioral Use Licensing for Responsible AI", "Description": "A paper that provides a theoretical framework for licenses inteded for open models with use restrictions", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["License Selection"], "Date": "6-2022", "Primary Link": "Paper", "Paper Link": "https://dl.acm.org/doi/10.1145/3531146.3533143", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "BigCode Open RAIL-M License", "Description": "Template for a responsible AI model license. Use restrictions include generation and dissemination of malware", "Modalities": ["Text"], "Categories": ["License Selection"], "Date": "5-2023", "Primary Link": "Hugging Face object", "Paper Link": "", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement", "Added By": "Original Authors"} -{"Name": "BigScience Open RAIL-M License", "Description": "Template for a responsible AI model license. 
Use restrictions include defamation, disinformation, and discrimination", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["License Selection"], "Date": "8-2022", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://static1.squarespace.com/static/5c2a6d5c45776e85d1482a7e/t/6308bb4bba3a2a045b72a4b0/1661516619868/BigScience+Open+RAIL-M+License.pdf", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Choose an open source license", "Description": "A guide for choosing among open source licenses that includes general selection criteria and explanations for software licenses", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["License Selection"], "Date": "Frequently Updated", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "https://choosealicense.com/", "GitHub Link": "https://github.com/github/choosealicense.com/tree/gh-pages", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Create Commons License Chooser", "Description": "A guide for choosing among Creative Commons licenses with an explanation of how they function", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["License Selection"], "Date": "Frequently Updated", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://chooser-beta.creativecommons.org/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Legal Playbook For Natural Language Processing Researchers", "Description": "This playbook is a legal research resource for various activities related to data gathering, data governance, and disposition of an AI model available as a public resource.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["License Selection"], "Date": "7-1905", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://bigscience.huggingface.co/blog/legal-playbook-for-natural-language-processing-researchers", "GitHub Link": "", "HuggingFace Link": "", "Added 
By": "Original Authors"} -{"Name": "Licensing is neither feasible nor effective for addressing AI risks", "Description": "Argues that licensing is not the correct way to address risks with AI systems", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["License Selection"], "Date": "6-2023", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://www.aisnakeoil.com/p/licensing-is-neither-feasible-nor", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Open RAIL-S License", "Description": "Template for a responsible AI source code license. Use restrictions relate to surveillance, synthetic media, healthcare and the criminal legal system", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["License Selection"], "Date": "11-2023", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://www.licenses.ai/source-code-license", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Primer on AI2 ImpACT Licenses", "Description": "A post by AI2 describing when and why an organization should use a specific ImpACT license", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["License Selection"], "Date": "7-2023", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://allenai.org/impact-license", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "The Open Source Definition", "Description": "The definition of an \"open source\" license", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["License Selection"], "Date": "2-2023", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://opensource.org/osd/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "The Turning Way, Licensing", "Description": "A guide to reproducible research and licensing", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["License Selection"], "Date": "Frequently Updated", 
"Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://the-turing-way.netlify.app/reproducible-research/licensing", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "What is Free Software?", "Description": "A philosophical argument for why free software licenses are important", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["License Selection"], "Date": "2-2023", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://www.gnu.org/philosophy/free-sw.en.html", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Ecosystem Cards", "Description": "Ecosystem Graphs centralize information about models and their impact in the broader ecosystem. ", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Documentation"], "Date": "3-2023", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/2303.15772", "Website Link": "https://hai.stanford.edu/news/ecosystem-graphs-social-footprint-foundation-models", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Foundation Model Transparency Index", "Description": "An index to measure the transparency of a foundation model with respect to its inputs, development, and downstream uses or policies.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Documentation"], "Date": "10-2023", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/2310.12941", "Website Link": "https://crfm.stanford.edu/fmti/", "GitHub Link": "https://github.com/stanford-crfm/fmti", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Model Card Resources", "Description": "A release of several resources surrounding model cards, including templates and tools for easy documentation creation, and how these are frequently used in practice.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Documentation"], "Date": "12-2022", "Primary Link": 
"Webpage", "Paper Link": "", "Website Link": "https://huggingface.co/blog/model-cards", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Model Cards", "Description": "A standard for reporting and documenting machine learning models, for promoting and easing transparent and open model development or reporting.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Documentation"], "Date": "10-2018", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/abs/1810.03993", "Website Link": "https://huggingface.co/spaces/huggingface/Model_Cards_Writing_Tool", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Hugging Face ML Research Release Toolkit ", "Description": "A new researcher guide to releasing model or data resources, documenting the research and Hugging Face objects.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Documentation"], "Date": "12-2023", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://docs.google.com/document/d/1EOxyZ11piIIRLDlhofX8nfnU0mHCU-TZ3EU4tx5g9aE/edit#heading=h.8zrjwmlee7ge", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "C4", "Description": "An English, cleaned version of Common Crawl's web crawl corpus (https://commoncrawl.org).", "Modalities": ["Text"], "Categories": ["Pretraining Data Sources"], "Date": "4-2019", "Primary Link": "Hugging Face object", "Paper Link": "https://arxiv.org/abs/1910.10683", "Website Link": "https://commoncrawl.org", "GitHub Link": "https://github.com/google-research/text-to-text-transfer-transformer#c4", "HuggingFace Link": "https://huggingface.co/datasets/allenai/c4", "Added By": "Original Authors"} -{"Name": "Common Voice", "Description": "28k hours [as of 11/2023] of crowd-sourced read speech from 100+ languages", "Modalities": ["Speech"], "Categories": ["Pretraining Data Sources"], "Date": "11-2023", "Primary Link": "Webpage", "Paper 
Link": "", "Website Link": "https://commonvoice.mozilla.org/en/datasets", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "CulturaX", "Description": "A pertaining dataset of 16T tokens, covering 167 languages, cleaned, deduplicated, and refined. Combines mC4 into 2020, with OSCAR project data up to 2023.", "Modalities": ["Text"], "Categories": ["Pretraining Data Sources"], "Date": "9-2023", "Primary Link": "Hugging Face object", "Paper Link": "https://arxiv.org/abs/2309.09400", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "https://huggingface.co/datasets/uonlp/CulturaX", "Added By": "Original Authors"} -{"Name": "DataComp-1B and CommonPool-13B", "Description": "A large pool of 13B image-text pairs from CommonCrawl and a curated 1B subset", "Modalities": ["Text", "Vision"], "Categories": ["Pretraining Data Sources"], "Date": "4-2023", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/2304.14108", "Website Link": "https://www.datacomp.ai/", "GitHub Link": "https://github.com/mlfoundations/datacomp", "HuggingFace Link": "https://huggingface.co/datasets/mlfoundations/datacomp_1b", "Added By": "Original Authors"} -{"Name": "Dolma", "Description": "A pretraining dataset of 3 trillion tokens from a diverse mix of web content, academic publications, code, books, and encyclopedic materials.", "Modalities": ["Text"], "Categories": ["Pretraining Data Sources"], "Date": "8-2023", "Primary Link": "Hugging Face object", "Paper Link": "https://arxiv.org/abs/2402.00159", "Website Link": "", "GitHub Link": "https://github.com/allenai/dolma", "HuggingFace Link": "https://huggingface.co/datasets/allenai/dolma", "Added By": "Original Authors"} -{"Name": "GigaSpeech", "Description": "40k hours (10k transcribed) multi-domain English speech corpus", "Modalities": ["Speech"], "Categories": ["Pretraining Data Sources"], "Date": "7-1905", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2106.06909", "Website 
Link": "", "GitHub Link": "https://github.com/SpeechColab/GigaSpeech", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Golos", "Description": "1,240 hours of crowd-sourced Russian speech", "Modalities": ["Speech"], "Categories": ["Pretraining Data Sources"], "Date": "6-2021", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/2106.10161", "Website Link": "https://www.openslr.org/114/", "GitHub Link": "https://github.com/sberdevices/golos", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "IndicCorp v2", "Description": "A multilingual pre-training corpus for 24 Indian languages", "Modalities": ["Text"], "Categories": ["Pretraining Data Sources"], "Date": "5-2023", "Primary Link": "GitHub", "Paper Link": "https://aclanthology.org/2023.acl-long.693/", "Website Link": "", "GitHub Link": "https://github.com/AI4Bharat/IndicBERT/tree/main#indiccorp-v2", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "IndicSUPERB", "Description": "1,684 hour crowd-sourced corpus of 12 Indian languages", "Modalities": ["Speech"], "Categories": ["Pretraining Data Sources"], "Date": "8-2022", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/2208.11761", "Website Link": "https://ai4bharat.iitm.ac.in/indicsuperb/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Libri-Light", "Description": "60k hour read English speech from LibriVox audiobooks", "Modalities": ["Speech"], "Categories": ["Pretraining Data Sources"], "Date": "12-2019", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/1912.07875", "Website Link": "", "GitHub Link": "https://github.com/facebookresearch/libri-light", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "LibriSpeech", "Description": "960 hour read English speech from LibriVox audiobooks", "Modalities": ["Speech"], "Categories": ["Pretraining Data Sources"], "Date": "7-1905", "Primary Link": "Webpage", "Paper Link": 
"http://www.danielpovey.com/files/2015_icassp_librispeech.pdf", "Website Link": "https://www.openslr.org/12/", "GitHub Link": "", "HuggingFace Link": "https://huggingface.co/datasets/librispeech_asr", "Added By": "Original Authors"} -{"Name": "MADLAD-400", "Description": "A manually audited, general domain 3T token monolingual dataset based on CommonCrawl, spanning 419 languages.", "Modalities": ["Text"], "Categories": ["Pretraining Data Sources"], "Date": "9-2023", "Primary Link": "Hugging Face object", "Paper Link": "https://arxiv.org/abs/2309.04662", "Website Link": "", "GitHub Link": "https://github.com/google-research/google-research/tree/master/madlad_400", "HuggingFace Link": "https://huggingface.co/datasets/allenai/MADLAD-400", "Added By": "Original Authors"} -{"Name": "mC4", "Description": "The fully multilingual, cleaned version of Common Crawl's web crawl corpus (https://commoncrawl.org).", "Modalities": ["Text"], "Categories": ["Pretraining Data Sources"], "Date": "4-2019", "Primary Link": "Hugging Face object", "Paper Link": "https://arxiv.org/abs/1910.10683", "Website Link": "https://commoncrawl.org", "GitHub Link": "", "HuggingFace Link": "https://huggingface.co/datasets/mc4", "Added By": "Original Authors"} -{"Name": "MMC4", "Description": "Interleaved image-text data from Common Crawl (570M images, 43B tokens)", "Modalities": ["Text", "Vision"], "Categories": ["Pretraining Data Sources"], "Date": "4-2023", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2304.06939", "Website Link": "", "GitHub Link": "https://github.com/allenai/mmc4", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "OBELICS", "Description": "Interleaved image-text data from Common Crawl (353 M images, 115B tokens)", "Modalities": ["Text", "Vision"], "Categories": ["Pretraining Data Sources"], "Date": "6-2023", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2306.16527", "Website Link": "https://huggingface.co/blog/idefics", "GitHub 
Link": "https://github.com/huggingface/OBELICS", "HuggingFace Link": "https://huggingface.co/datasets/HuggingFaceM4/OBELICS", "Added By": "Original Authors"} -{"Name": "OLC", "Description": "The Open License Corpus is a 228B token corpus of permissively-licensed, primarily English text data for pretraining.", "Modalities": ["Text"], "Categories": ["Pretraining Data Sources"], "Date": "8-2023", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2308.04430", "Website Link": "", "GitHub Link": "https://github.com/kernelmachine/silo-lm#download-data", "HuggingFace Link": "https://huggingface.co/datasets/kernelmachine/open-license-corpus", "Added By": "Original Authors"} -{"Name": "OpenWebMath", "Description": "A dataset containing the majority of the high-quality, mathematical text from the internet. It is filtered and extracted from over 200B HTML files on Common Crawl down to a set of 6.3 million documents containing a total of 14.7B tokens.", "Modalities": ["Text"], "Categories": ["Pretraining Data Sources"], "Date": "10-2023", "Primary Link": "Hugging Face object", "Paper Link": "https://arxiv.org/abs/2310.06786", "Website Link": "", "GitHub Link": "https://github.com/keirp/OpenWebMath", "HuggingFace Link": "https://huggingface.co/datasets/open-web-math/open-web-math", "Added By": "Original Authors"} -{"Name": "OPUS", "Description": "The Open Parallel Corpus is a massive collection of translated text pairs from the web.", "Modalities": ["Text"], "Categories": ["Pretraining Data Sources"], "Date": "Frequently Updated", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://opus.nlpl.eu/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "OSCAR", "Description": "The Open Super-large Crawled Aggregated coRpus provides web-based multilingual datasets across 166 languages.", "Modalities": ["Text"], "Categories": ["Pretraining Data Sources"], "Date": "Frequently Updated", "Primary Link": "Webpage", "Paper 
Link": "https://aclanthology.org/2022.wnut-1.23/", "Website Link": "https://oscar-project.org/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "peS2o", "Description": "A collection of ~40M creative open-access academic papers, cleaned, filtered, and formatted for pre-training of language models, originally derived from the Semantic Scholar Open Research Corpus (S2ORC).", "Modalities": ["Text"], "Categories": ["Pretraining Data Sources"], "Date": "1-2023", "Primary Link": "Hugging Face object", "Paper Link": "https://arxiv.org/abs/1911.02782", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "https://huggingface.co/datasets/allenai/peS2o", "Added By": "Original Authors"} -{"Name": "Pile of Law", "Description": "An open-source, English dataset with ∼256GB of legal and administrative data, covering court opinions, contracts, administrative rules, and legislative records.", "Modalities": ["Text"], "Categories": ["Pretraining Data Sources"], "Date": "11-2022", "Primary Link": "Hugging Face object", "Paper Link": "https://arxiv.org/abs/2207.00220", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "https://huggingface.co/datasets/pile-of-law/pile-of-law", "Added By": "Original Authors"} -{"Name": "RedPajama v2", "Description": "A pretraining dataset of 30 trillion filtered and deduplicated tokens (100+ trillions raw) from 84 CommonCrawl dumps covering 5 languages, along with 40+ pre-computed data quality annotations that can be used for further filtering and weighting.", "Modalities": ["Text"], "Categories": ["Pretraining Data Sources"], "Date": "10-2023", "Primary Link": "Hugging Face object", "Paper Link": "", "Website Link": "https://www.together.ai/blog/redpajama-data-v2", "GitHub Link": "https://github.com/togethercomputer/RedPajama-Data", "HuggingFace Link": "https://huggingface.co/datasets/togethercomputer/RedPajama-Data-V2", "Added By": "Original Authors"} -{"Name": "ROOTS", "Description": "A massive 
multilingual pretraining corpus from BigScience, comprised of 1.6TB of text spanning 59 languages. It is a mix of OSCAR (https://oscar-project.org/) and the datasets found in the BigScience Catalogue (https://huggingface.co/spaces/bigscience/SourcingCatalog).", "Modalities": ["Text"], "Categories": ["Pretraining Data Sources"], "Date": "5-2022", "Primary Link": "Hugging Face object", "Paper Link": "https://arxiv.org/abs/2303.03915", "Website Link": "https://bigscience.huggingface.co/", "GitHub Link": "https://github.com/bigscience-workshop/bigscience/tree/master/data", "HuggingFace Link": "https://huggingface.co/bigscience-data", "Added By": "Original Authors"} -{"Name": "Samrómur", "Description": "2,200 hour crowd-sourced corpus of Icelandic speech", "Modalities": ["Speech"], "Categories": ["Pretraining Data Sources"], "Date": "7-1905", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://www.openslr.org/128/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Shrutilipi", "Description": "6,400 hour corpus of TV/Radio broadcasts from 12 Indian languages", "Modalities": ["Speech"], "Categories": ["Pretraining Data Sources"], "Date": "8-2022", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/2208.12666", "Website Link": "https://ai4bharat.iitm.ac.in/shrutilipi/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "The People’s Speech", "Description": "30k hour conversational English dataset", "Modalities": ["Speech"], "Categories": ["Pretraining Data Sources"], "Date": "11-2021", "Primary Link": "Hugging Face object", "Paper Link": "https://arxiv.org/abs/2111.09344", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "https://huggingface.co/datasets/MLCommons/peoples_speech", "Added By": "Original Authors"} -{"Name": "The Pile", "Description": "An 825GB English pretraining corpus that mixes portions of common crawl with 22 smaller, high-quality datasets 
combined together.", "Modalities": ["Text"], "Categories": ["Pretraining Data Sources"], "Date": "12-2020", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/2101.00027", "Website Link": "https://pile.eleuther.ai/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "The Proof Pile 2", "Description": "The Proof-Pile-2 is a 55 billion token dataset of mathematical and scientific documents.", "Modalities": ["Text"], "Categories": ["Pretraining Data Sources"], "Date": "9-2023", "Primary Link": "Hugging Face object", "Paper Link": "https://arxiv.org/abs/2310.10631", "Website Link": "https://blog.eleuther.ai/llemma/", "GitHub Link": "https://github.com/EleutherAI/math-lm", "HuggingFace Link": "https://huggingface.co/datasets/EleutherAI/proof-pile-2", "Added By": "Original Authors"} -{"Name": "The RefinedWeb", "Description": "An English-only, web-only, deduplicated pretraining dataset of five trillion tokens.", "Modalities": ["Text"], "Categories": ["Pretraining Data Sources"], "Date": "6-2023", "Primary Link": "Hugging Face object", "Paper Link": "https://arxiv.org/abs/2306.01116", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "https://huggingface.co/datasets/tiiuae/falcon-refinedweb", "Added By": "Original Authors"} -{"Name": "The Stack", "Description": "The Stack is a 6TB, permissively-licensed pretraining dataset from active GitHub repositories covering 358 programming languages.", "Modalities": ["Text"], "Categories": ["Pretraining Data Sources"], "Date": "11-2022", "Primary Link": "Hugging Face object", "Paper Link": "https://arxiv.org/abs/2211.15533", "Website Link": "https://www.bigcode-project.org/docs/about/the-stack/#datasets-and-data-governance-tools-released-by-bigcode", "GitHub Link": "https://github.com/bigcode-project/bigcode-dataset", "HuggingFace Link": "https://huggingface.co/datasets/bigcode/the-stack", "Added By": "Original Authors"} -{"Name": "VoxPopuli", "Description": "400k hours of 
unlabelled speech from 23 languages of the European parliament", "Modalities": ["Speech"], "Categories": ["Pretraining Data Sources"], "Date": "1-2021", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2101.00390", "Website Link": "", "GitHub Link": "https://github.com/facebookresearch/voxpopuli", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "WebVid-10M", "Description": "10M videos with captions", "Modalities": ["Text", "Vision"], "Categories": ["Pretraining Data Sources"], "Date": "4-2021", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/2104.00650", "Website Link": "https://maxbain.com/webvid-dataset/", "GitHub Link": "https://github.com/m-bain/webvid", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "WenetSpeech", "Description": "22.4k hour multi-domain corpus of Mandarin", "Modalities": ["Speech"], "Categories": ["Pretraining Data Sources"], "Date": "10-2021", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/2110.03370", "Website Link": "https://www.openslr.org/121/", "GitHub Link": "https://github.com/wenet-e2e/WenetSpeech", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "WURA", "Description": "A manually audited multilingual pre-training corpus (document-level dataset) for 16 African languages and four high-resource languages widely spoken in Africa (English, French, Arabic and Portuguese)", "Modalities": ["Text"], "Categories": ["Pretraining Data Sources"], "Date": "11-2023", "Primary Link": "Hugging Face object", "Paper Link": "https://aclanthology.org/2023.emnlp-main.11/", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "https://huggingface.co/datasets/castorini/wura", "Added By": "Original Authors"} -{"Name": "WebDatasets", "Description": "A dataset format for high-performance streaming of data. 
Especially useful for modalities other than language that are more I/O intensive for training', such as images, video, or audio.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Pretraining Data Sources"], "Date": "Frequently Updated", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/webdataset/webdataset", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Multi Legal Pile", "Description": "A large-scale multilingual legal dataset and superset of the Pile of Law, suited for pretraining language models. It spans over 24 languages and five legal text types.", "Modalities": ["Text"], "Categories": ["Pretraining Data Sources"], "Date": "6-2023", "Primary Link": "Hugging Face object", "Paper Link": "https://arxiv.org/abs/2306.02069", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "https://huggingface.co/datasets/joelniklaus/Multi_Legal_Pile", "Added By": "Original Authors"} -{"Name": "GPT-NeoX", "Description": "A library for training large language models, built off Megatron-DeepSpeed and Megatron-LM with an easier user interface. 
Used at massive scale on a variety of clusters and hardware setups.", "Modalities": ["Text"], "Categories": ["Model Training: Pretraining Repositories"], "Date": "Frequently Updated", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/EleutherAI/gpt-neox", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Kosmos-2", "Description": "For training multimodal models with CLIP backbones.", "Modalities": ["Text", "Vision"], "Categories": ["Model Training: Pretraining Repositories"], "Date": "6-2023", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2306.14824", "Website Link": "", "GitHub Link": "https://github.com/microsoft/unilm/tree/master/kosmos-2", "HuggingFace Link": "https://huggingface.co/spaces/ydshieh/Kosmos-2", "Added By": "Original Authors"} -{"Name": "Lhotse", "Description": "Python library for handling speech data in machine learning projects", "Modalities": ["Speech"], "Categories": ["Model Training: Pretraining Repositories"], "Date": "10-2023", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "https://github.com/lhotse-speech/lhotse", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Megatron-DeepSpeed", "Description": "A library for training large language models, built off of Megatron-LM but extended by Microsoft to support features of their DeepSpeed library.", "Modalities": ["Text"], "Categories": ["Model Training: Pretraining Repositories"], "Date": "Frequently Updated", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/microsoft/Megatron-DeepSpeed", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Megatron-LM", "Description": "One of the earliest open-source pretraining codebases for large language models. 
Still updated and has been used for a number of landmark distributed training and parallelism research papers by NVIDIA.", "Modalities": ["Text"], "Categories": ["Model Training: Pretraining Repositories"], "Date": "", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/NVIDIA/Megatron-LM", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "OpenCLIP", "Description": "Supports training and inference for over 100 CLIP models", "Modalities": ["Text", "Vision"], "Categories": ["Model Training: Pretraining Repositories"], "Date": "9-2021", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/mlfoundations/open_clip", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "OpenLM", "Description": "OpenLM is a minimal language modeling repository, aimed to facilitate research on medium sized LMs. They have verified the performance of OpenLM up to 7B parameters and 256 GPUs. They only depend only on PyTorch, XFormers, or Triton.", "Modalities": ["Text"], "Categories": ["Model Training: Pretraining Repositories"], "Date": "Frequently Updated", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/mlfoundations/open_lm", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Pytorch Image Models (timm)", "Description": "Hub for models, scripts and pre-trained weights for image classification models.", "Modalities": ["Vision"], "Categories": ["Model Training: Pretraining Repositories"], "Date": "5-2019", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/huggingface/pytorch-image-models", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Stable Audio Tools", "Description": "A codebase for distributed training of generative audio models.", "Modalities": ["Speech"], "Categories": ["Model Training: Pretraining Repositories"], "Date": 
"Frequently Updated", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/Stability-AI/stable-audio-tools", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Bias Benchmark for QA (BBQ)", "Description": "A dataset of question-sets constructed by the authors that highlight attested social biases against people belonging to protected classes along nine different social dimensions relevant for U.S. English-speaking contexts.", "Modalities": ["Text"], "Categories": ["Model Evaluation: Risks & Harms"], "Date": "10-2021", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2110.08193", "Website Link": "", "GitHub Link": "https://github.com/nyu-mll/BBQ", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Crossmodal-3600", "Description": "Image captioning evaluation with geographically diverse images in 36 languages", "Modalities": ["Text", "Vision"], "Categories": ["Model Evaluation: Risks & Harms"], "Date": "5-2022", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2205.12522", "Website Link": "", "GitHub Link": "https://google.github.io/crossmodal-3600/", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "FactualityPrompt", "Description": "A benchmark to measure factuality in language models.", "Modalities": ["Text"], "Categories": ["Model Evaluation: Risks & Harms"], "Date": "6-2022", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2206.04624", "Website Link": "", "GitHub Link": "https://github.com/nayeon7lee/FactualityPrompt", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "From text to talk", "Description": "Harnessing conversational corpora for humane and diversity-aware language technology. 
They show how interactional data from 63 languages (26 families) harbours insights about turn-taking, timing, sequential structure and social action.", "Modalities": ["Speech"], "Categories": ["Model Evaluation: Risks & Harms"], "Date": "5-2022", "Primary Link": "Paper", "Paper Link": "https://aclanthology.org/2022.acl-long.385/ ", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Hallucinations", "Description": "Public LLM leaderboard computed using Vectara's Hallucination Evaluation Model. This evaluates how often an LLM introduces hallucinations when summarizing a document. ", "Modalities": ["Text"], "Categories": ["Model Evaluation: Risks & Harms"], "Date": "10-2023", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "https://github.com/vectara/hallucination-leaderboard", "GitHub Link": "", "HuggingFace Link": "https://huggingface.co/vectara/hallucination_evaluation_model", "Added By": "Original Authors"} -{"Name": "HolisticBias", "Description": "A bias and toxicity benchmark using templated sentences, covering nearly 600 descriptor terms across 13 different demographic axes, for a total of 450k examples", "Modalities": ["Text"], "Categories": ["Model Evaluation: Risks & Harms"], "Date": "10-2022", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2205.09209", "Website Link": "https://ai.meta.com/research/publications/im-sorry-to-hear-that-finding-new-biases-in-language-models-with-a-holistic-descriptor-dataset/", "GitHub Link": "https://github.com/facebookresearch/ResponsibleNLP/tree/main/holistic_bias", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Purple Llama CyberSecEval", "Description": "A benchmark for coding assistants, measuring their propensity to generate insecure code and level of compliance when asked to assist in cyberattacks.", "Modalities": ["Text"], "Categories": ["Model Evaluation: Risks & Harms"], "Date": "12-2023", "Primary Link": "Webpage", 
"Paper Link": "", "Website Link": "https://ai.meta.com/research/publications/purple-llama-cyberseceval-a-benchmark-for-evaluating-the-cybersecurity-risks-of-large-language-models/", "GitHub Link": "https://github.com/facebookresearch/PurpleLlama/tree/main/CybersecurityBenchmarks", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Purple Llama Guard", "Description": "A tool to identify and protect against malicious inputs to LLMs.", "Modalities": ["Text"], "Categories": ["Model Evaluation: Risks & Harms"], "Date": "12-2023", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/2312.06674", "Website Link": "https://ai.meta.com/research/publications/llama-guard-llm-based-input-output-safeguard-for-human-ai-conversations/", "GitHub Link": "https://github.com/facebookresearch/PurpleLlama/tree/main/Llama-Guard", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Racial disparities in automated speech recognition", "Description": "A discussion of racial disparities and inclusiveness in automated speech recognition.", "Modalities": ["Speech"], "Categories": ["Model Evaluation: Risks & Harms"], "Date": "3-2020", "Primary Link": "Paper", "Paper Link": "", "Website Link": "https://www.pnas.org/doi/10.1073/pnas.1915768117", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "RealToxicityPrompts", "Description": "A dataset of 100k sentence snippets from the web for researchers to further address the risk of neural toxic degeneration in models.", "Modalities": ["Text"], "Categories": ["Model Evaluation: Risks & Harms"], "Date": "9-2020", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2009.11462", "Website Link": "https://toxicdegeneration.allenai.org/", "GitHub Link": "https://github.com/allenai/real-toxicity-prompts", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Red Teaming LMs with LMs", "Description": "A method for using one language model to automatically find 
cases where a target LM behaves in a harmful way, by generating test cases (\"red teaming\")", "Modalities": ["Text"], "Categories": ["Model Evaluation: Risks & Harms"], "Date": "2-2022", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/abs/2202.03286", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Safety evaluation repository", "Description": "A repository of safety evaluations, across all modalities and harms, as of late 2023. Useful for delving deeper if the following evaluations don't meet your needs.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Evaluation: Risks & Harms"], "Date": "10-2023", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://dpmd.ai/46CPd58", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "SimpleSafetyTests", "Description": "Small probe set (100 English text prompts) covering severe harms: child abuse, suicide, self-harm and eating disorders, scams and fraud, illegal items, and physical harm", "Modalities": ["Text"], "Categories": ["Model Evaluation: Risks & Harms"], "Date": "11-2023", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2311.08370", "Website Link": "", "GitHub Link": "https://github.com/bertiev/SimpleSafetyTests", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "SneakyPrompt", "Description": "Automated jailbreaking method to generate NSFW content even with models that have filters applied", "Modalities": ["Vision"], "Categories": ["Model Evaluation: Risks & Harms"], "Date": "5-2023", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2305.12082", "Website Link": "", "GitHub Link": "https://github.com/Yuchen413/text2image_safety", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "StableBias", "Description": "Bias testing benchmark for Image to Text models, based on gender-occupation associations.", "Modalities": ["Vision"], 
"Categories": ["Model Evaluation: Risks & Harms"], "Date": "3-2023", "Primary Link": "Hugging Face object", "Paper Link": "", "Website Link": "https://arxiv.org/abs/2303.11408", "GitHub Link": "", "HuggingFace Link": "https://huggingface.co/spaces/society-ethics/StableBias", "Added By": "Original Authors"} -{"Name": "Cerebras Model Lab", "Description": "A calculator to apply compute-optimal scaling laws for a given budget, including factoring expected total inference usage.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Training: Efficiency & Resource Allocation"], "Date": "5-2023", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://www.cerebras.net/model-lab/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "QLoRa", "Description": "An efficient finetuning approach that reduces memory usage while training.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Training: Efficiency & Resource Allocation"], "Date": "5-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/abs/2305.14314", "Website Link": "", "GitHub Link": "https://github.com/artidoro/qlora", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Scaling Data-Constrained Language Models", "Description": "Demonstrates an optimal allocation of compute when dataset size is bounded", "Modalities": ["Text"], "Categories": ["Model Training: Efficiency & Resource Allocation"], "Date": "5-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/abs/2305.16264", "Website Link": "", "GitHub Link": "https://github.com/huggingface/datablations", "HuggingFace Link": "https://huggingface.co/datablations", "Added By": "Original Authors"} -{"Name": "Training Compute-Optimal Language Models", "Description": "Proposes an optimal allocation of computational budget between model and dataset size, and shows experimental design for fitting scaling laws for compute allocation in a new setting.", 
"Modalities": ["Text"], "Categories": ["Model Training: Efficiency & Resource Allocation"], "Date": "3-2022", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/abs/2203.15556", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "AI Incident Database", "Description": "A database of harmful incidents tied to AI systems where developers or users can submit incident reports", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Usage Monitoring"], "Date": "Frequently Updated", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://incidentdatabase.ai/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "BigScience Ethical Charter", "Description": "Outlines BigScience's core values and how they promote them, which in turn guides use restrictions and communicates acceptable usage to users", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Usage Monitoring"], "Date": "6-2022", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://bigscience.huggingface.co/blog/bigscience-ethical-charter", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Llama 2 Responsible Use Guide", "Description": "Guidance for downstream developers on how to responsibly build with Llama 2. 
Includes details on how to report issues and instructions related to red-teaming and RLHF", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Usage Monitoring"], "Date": "12-2023", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://ai.meta.com/llama/responsible-use-guide/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Model Gating from Hugging Face", "Description": "A resource describing how to require user credentials for model access, which may be appropriate for models trained for topics such as hate speech", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Usage Monitoring"], "Date": "Frequently Updated", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://huggingface.co/docs/hub/models-gated", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Model Monitoring in Practice Tutorial", "Description": "A tutorial given at FAccT and other venues describing how and why to monitor ML models. 
Includes a presentation on using transformer models to monitor for error detection", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Usage Monitoring"], "Date": "6-2022", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://sites.google.com/view/model-monitoring-tutorial", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Robust Invisible Video Watermarking with Attention", "Description": "A widely used watermark for video models ", "Modalities": ["Vision"], "Categories": ["Usage Monitoring"], "Date": "9-2029", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/abs/1909.01285", "Website Link": "", "GitHub Link": "https://github.com/DAI-Lab/RivaGAN", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Robust Distortion-free Watermarks for Language Models", "Description": "A watermark for autoregressive language models", "Modalities": ["Text"], "Categories": ["Usage Monitoring"], "Date": "7-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/abs/2307.15593", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "A Holistic Approach to Undesired Content Detection in the Real World", "Description": "Description of five primary categories (Sexual, Hateful, Violent, Self-harm, Harassment) with sub-categories (e.g. Sexual / sexual content involving minors). Also describes a moderation filter (the OpenAI moderation endpoint), and releases a dataset labelled for the categories.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Evaluation: Risks & Harms Taxonomies"], "Date": "2-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/pdf/2208.03274.pdf", "Website Link": "", "GitHub Link": "https://github.com/openai/moderation-api-release", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Perspective API", "Description": "Perspective API for content moderation. 
It has three classes of categories, each with 6+ attributes. (1) Production (Toxicity, Severe Toxicity, Identity Attack, Insult, Profanity, and Threats), (2) Experimental (Toxicity, Severe Toxicity, Identity Attack, Insult, Profanity, Threat, Sexually Explicit, and Flirtation), (3) NY Times (Attack on author, Attack on commenter, Incoherent, Inflammatory, Likely to Reject, Obscene, Spam, Unsubstantial).", "Modalities": ["Text"], "Categories": ["Model Evaluation: Risks & Harms"], "Date": "8-2022", "Primary Link": "Paper", "Paper Link": "https://dl.acm.org/doi/pdf/10.1145/3534678.3539147", "Website Link": "https://developers.perspectiveapi.com/s/about-the-api-attributes-and-languages?language=en_US", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Mistral in-context self-reflection safety prompt", "Description": "Self-reflection prompt for use as a content moderation filter. It returns a binary value (safe/not) with 13 subcategories: Illegal, Child abuse, Hate Violence Harassment, Malware, Physical Harm, Economic Harm, Fraud, Adult, Political campaigning or lobbying, Privacy invasion, Unqualified law advice, Unqualified financial advice, Unqualified health advice", "Modalities": ["Text"], "Categories": ["Model Evaluation: Risks & Harms"], "Date": "10-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/pdf/2310.06825.pdf", "Website Link": "https://www.promptingguide.ai/models/mistral-7b", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Google, Gemini API Safety Filters (via Vertex)", "Description": "Safety filter for Gemini models, available through Vertex. 4 safety attributes are described: Hate speech, Harassment, Sexually Explicit, and Dangerous Content. Probabilities are returned for each attribute (Negligible, Low, Medium, High). 
", "Modalities": ["Text"], "Categories": ["Model Evaluation: Risks & Harms"], "Date": "12-2023", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://cloud.google.com/vertex-ai/docs/generative-ai/multimodal/configure-safety-attributes", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Google, PaLM API Safety Filters (via Vertex)", "Description": "Safety filter for PaLM models, available through Vertex. 16 safety attributes are described (some of which are 'topical' rather than purely safety risks): Derogatory, Toxic, Violent, Sexual, Insult, Profanity, Death Harm & Tragedy, Firearms & Weapons, Public safety, Health, Religion & belief, Illicit drugs, War & conflict, Politics, Finance, Legal.", "Modalities": ["Text"], "Categories": ["Model Evaluation: Risks & Harms"], "Date": "3-2023", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://cloud.google.com/vertex-ai/docs/generative-ai/configure-safety-attributes-palm", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "ActiveFence's LLM Safety Review: Benchmarks and Analysis", "Description": "Description of 4 risk categories, as part of a benchmark review of LLM safety: (1) Hate, (2) Misinformation, (3) Self-harm & Suicide, (4) Child abuse & exploitation.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Evaluation: Risks & Harms Taxonomies"], "Date": "07-2023", "Primary Link": "Paper", "Paper Link": "https://www.activefence.com/LLMSafety", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Anthropic content moderation prompt", "Description": "In-context prompt for assessing whether messages and responses contain inappropriate content: \"violent, illegal or pornographic activities\"", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Evaluation: Risks & Harms"], "Date": "12-2023", "Primary Link": "Webpage", "Paper Link": "", 
"Website Link": "https://docs.anthropic.com/claude/docs/content-moderation", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Red Teaming Language Models to Reduce Harms: Methods, Scaling Behaviors, and Lessons Learned", "Description": "Description of 20 risk areas, as part of red teaming Anthropics' models. Two of the tags are not interpretable (\"Other\" and \"N/A - Invalid attempt\"): Discrimination & justice, Hate speech & offensive language, Violence & incitement, Non-violent unethical behaviour (e.g. lying, cheating), Bullying & harassment, Other, Theft, N/A - Invalid attempt, Soliciting personally identifiable information, Conspiracy theories & misinformation, Substance abuse & banned substances, Fraud & deception, Weapons, Adult content, Property crime & vandalism, Animal abuse, Terrorism & organized crime, Sexual exploitation & human trafficking, Self-harm, Child abuse.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Evaluation: Risks & Harms Taxonomies"], "Date": "10-2022", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/pdf/2209.07858.pdf", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "BEAVERTAILS: Towards Improved Safety Alignment of LLM via a Human-Preference Dataset", "Description": "Description of 14 risk areas, as part of a QA dataset for aligning models and evaluating their safety: Hate Speech, Offensive Language, Discrimination, Stereotype, Injustice, Violence, Aiding and Abetting, Incitement, Financial Crime, Property Crime, Theft, Privacy Violation, Drug Abuse, Weapons, Banned Substance, Non-Violent Unethical Behavior, Sexually Explicit, Adult Content, Controversial Topics, Politics, Misinformation Re. 
ethics, laws and safety, Terrorism, Organized Crime, Self-Harm, Animal Abuse, Child Abuse", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Evaluation: Risks & Harms Taxonomies"], "Date": "10-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/pdf/2307.04657.pdf", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Safety Assessment of Chinese Large Language Models", "Description": "Description of 8 risk areas (called \"safety scenarios)\": Insult, Unfairness and Discrimination, Crimes and Illegal Activities, Sensitive Topics, Physical Harm, Mental health, Privacy and Property, Ethics and Morality. Six \"instruction attacks\" are also described: Goal hijacking, Prompt leaking, RolePlay Instruction, Unsafe Instruction Topic, Inquiry with Unsafe Opinion, Reverse Exposure.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Evaluation: Risks & Harms Taxonomies"], "Date": "4-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/pdf/2304.10436.pdf", "Website Link": "", "GitHub Link": "https://github.com/thu-coai/Safety-Prompts", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "DECODINGTRUST: A Comprehensive Assessment of Trustworthiness in GPT Models", "Description": "Description of 8 evaluation areas: toxicity, stereotypes bias, adversarial robustness, out-of-distribution robustness, robustness against adversarial demonstrations, privacy, machine ethics, fairness.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Evaluation: Risks & Harms Taxonomies"], "Date": "1-2024", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/pdf/2306.11698.pdf", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "A Unified Typology of Harmful Content", "Description": "Taxonomy of harmful online content. 
There are 4 primary categories, which each have subcategories: (1) Hate and harassment (Doxxing, Identity attack, Identity misrepresentation, Insult, Sexual aggression, Threat of violence), (2) Self-inflicted harm (Eating disorder promotion, self-harm), (3) Ideological harm (Extremism Terrorism & Organized crime, Misinformation), (4) Exploitation (Adult sexual services, Child sexual abuse material, Scams).", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Evaluation: Risks & Harms Taxonomies"], "Date": "11-2020", "Primary Link": "Paper", "Paper Link": "https://aclanthology.org/2020.alw-1.16.pdf", "Website Link": "https://docs.cohere.com/docs/content-moderation-with-classify", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Towards Safer Generative Language Models: A Survey on Safety Risks, Evaluations, and Improvements", "Description": "Description of 7 risk areas, as part of a survey on LLM risks: Toxicity and Abusive Content, Unfairness and Discrimination, Ethics and Morality Issues, Controversial Opinions, Misleading Information, Privacy and Data Leakage, Malicious Use and Unleashing AI Agents.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Evaluation: Risks & Harms Taxonomies"], "Date": "11-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/pdf/2302.09270.pdf", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Llama 2: Open Foundation and Fine-Tuned Chat Models", "Description":
"Description of 3 risk areas, as part of the safety checks for releasing Llama2: (1) illicit and criminal activities (terrorism, theft, human trafficking), (2) hateful and harmful activities (defamation, self-harm, eating disorders, discrimination), and (3) unqualified advice (medical, financial and legal advice). Other risk categories are described as part of red teaming and soliciting feedback.", "Modalities": ["Text"], "Categories": ["Model Evaluation: Risks & Harms Taxonomies"], "Date": "7-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/pdf/2307.09288.pdf", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Ethical and social risks of harm from Language Models", "Description": "Two-tier taxonomy of risks, comprising both classification groups (of which there are 6) and associated harms (3 or 4 for each classification group). The classification groups are: (1) Discrimination, Exclusion and Toxicity, (2) Information Hazards, (3) Misinformation Harms, (4) Malicious Uses, (5) Human-Computer Interaction Harms, and (6) Automation, access, and environmental harms.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Evaluation: Risks & Harms Taxonomies"], "Date": "12-2021", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/pdf/2112.04359.pdf", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Sociotechnical Safety Evaluation of Generative AI Systems", "Description": "Two-tier taxonomy of risks, comprising both classification groups (of which there are 6) and associated harms (3 or 4 for each classification group).
The classification groups are: (1) Representation and Toxicity Harms, (2) Misinformation Harms, (3) Information & Society Harms, (4) Malicious Use, (5) Human Autonomy & Integrity Harms, and (6) Socioeconomic & Environmental Harms.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Evaluation: Risks & Harms Taxonomies"], "Date": "10-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/pdf/2310.11986.pdf", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Trustworthy LLMs: a Survey and Guideline for Evaluating Large Language Models' Alignment", "Description": "Two-tier taxonomy of risks, with seven major categories of LLM trustworthiness, each of which has several associated sub-categories: (1) Reliability, (2) Safety, (3) Fairness, (4) Resistance to Misuse, (5) Explainability and Reasoning, (6) Social Norms, and (7) Robustness.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Evaluation: Risks & Harms Taxonomies"], "Date": "8-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/pdf/2308.05374.pdf", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Process for Adapting Language Models to Society (PALMS) with Values-Targeted Datasets", "Description": "Description of 8 risk areas, as part of describing methods for aligning models: (1) Abuse, Violence and Threat (inclusive of self-harm), (2) Health (physical and mental), (3) Human characteristics and behaviour, (4) Injustice and inequality (incl. discrimination, harmful stereotypes), (5) Political opinion and destabilization, (6) Relationships (romantic, familial friendships), (7) Sexual activity (inclusive of pornography), (8) Terrorism (inclusive of white supremacy).", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Evaluation: Risks & Harms Taxonomies"], "Date": "6-2021", "Primary Link": "Paper", "Paper Link":
"https://arxiv.org/pdf/2106.10328.pdf", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Sociotechnical Harms of Algorithmic Systems: Scoping a Taxonomy for Harm Reduction", "Description": "Description of 5 categories of harm, with detailed subcategories: (1) Representational harms, (2) Allocative harms, (3) Quality of Service harms, (4) Interpersonal harms, and (5) Social system harms. ", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Evaluation: Risks & Harms Taxonomies"], "Date": "7-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/pdf/2210.05791.pdf", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Deepfakes, Phrenology, Surveillance, and More! A Taxonomy of AI Privacy Risks", "Description": "Taxonomy of 12 privacy risks, based on reviewing 321 privacy-related incidents, filtered from the AI, Algorithmic and Automation Incident and Controversy Repository (AIAAIC) Database. 
Risks are split into those that are created by AI (Identification, Distortion, Exposure, Aggregation, Phrenology/Physiognomy) and those that are exacerbated by AI (Intrusion, Surveillance, Exclusion, Secondary Use, Insecurity, Increased Accessibility).", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Evaluation: Risks & Harms Taxonomies"], "Date": "10-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/pdf/2310.07879.pdf", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "The Ethical Implications of Generative Audio Models: A Systematic Literature Review", "Description": "Taxonomy of 12 \"negative broader impacts\" from generative models involving speech and music.", "Modalities": ["Speech"], "Categories": ["Model Evaluation: Risks & Harms Taxonomies"], "Date": "7-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/pdf/2307.05527.pdf", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "An Overview of Catastrophic AI Risks", "Description": "Taxonomy of 4 catastrophic AI risks, with subcategories: (1) Malicious use (Bioterrorism, Uncontrolled AI agents, AI capabilities for propaganda, Censorship and surveillance), (2) AI race (Autonomous weapons, Cyberwarfare, Automated human labour [mass unemployment and dependence on AI systems]), (3) Organizational risks (AI accidentally leaked/stolen), (4) Rogue AIs (Proxy gaming, Goal drift, Power-seeking, Deception).", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Evaluation: Risks & Harms Taxonomies"], "Date": "9-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/pdf/2306.12001.pdf", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "The Malicious Use of Artificial Intelligence: Forecasting, Prevention, and Mitigation", "Description": "Taxonomy of 3 AI security risks, with
subcategories: (1) Digital Security, (2) Physical Security, (3) Political Security.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Evaluation: Risks & Harms Taxonomies"], "Date": "2-2018", "Primary Link": "Paper", "Paper Link": "https://img1.wsimg.com/blobby/go/3d82daa4-97fe-4096-9c6b-376b92c619de/downloads/MaliciousUseofAI.pdf?ver=1553030594217", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Open-sourcing highly capable foundation models", "Description": "Description of risks from malicious use of AI: Influence operations, Surveillance and population control, Scamming and spear phishing, Cyber attacks, Biological and chemical weapons development. Some \"extreme risks\" are also described in the paper (e.g. disruption to key societal functions).", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Evaluation: Risks & Harms Taxonomies"], "Date": "9-2023", "Primary Link": "Paper", "Paper Link": "https://cdn.governance.ai/Open-Sourcing_Highly_Capable_Foundation_Models_2023_GovAI.pdf", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "How Does Access Impact Risk? Assessing AI Foundation Model Risk Along a Gradient of Access ", "Description": "Description of risks from open-sourcing models, including five instances of malicious use: (1) Fraud and other crime schemes, (2) Undermining of social cohesion and democratic processes, (3) Human rights abuses, (4) Disruption of critical infrastructure, and (5) State conflict. 
", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Evaluation: Risks & Harms Taxonomies"], "Date": "12-2023", "Primary Link": "Paper", "Paper Link": "https://securityandtechnology.org/wp-content/uploads/2023/12/How-Does-Access-Impact-Risk-Assessing-AI-Foundation-Model-Risk-Along-A-Gradient-of-Access-Dec-2023.pdf", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "OpenAI Preparedness Framework (Beta)", "Description": "Description of 4 catastrophic AI risks: (1) Cybersecurity, (2) Chemical, Biological, Nuclear and Radiological (CBRN) threats, (3) Persuasion, and (4) Model autonomy. The paper also highlights the risk of \"unknown unknowns\".", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Evaluation: Risks & Harms Taxonomies"], "Date": "12-2023", "Primary Link": "Paper", "Paper Link": "https://cdn.openai.com/openai-preparedness-framework-beta.pdf", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Anthropic's Responsible Scaling Policy", "Description": "Framework with four tiers of model capability, from ASL-1 (smaller models) to ASL-4 (speculative), with increasing risk as models' capability increases. 
It also describes 4 catastrophic AI risks: (1) Misuse risks, (2) CBRN risks, (3) Cyber risks, and (4) Autonomy and replication risks.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Evaluation: Risks & Harms Taxonomies"], "Date": "9-2023", "Primary Link": "Paper", "Paper Link": "https://www-files.anthropic.com/production/files/responsible-scaling-policy-1.0.pdf", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Model evaluation for extreme risks", "Description": "Framework of 9 dangerous capabilities of AI models: (1) Cyber-offense, (2) Deception, (3) Persuasion & manipulation, (4) Political strategy, (5) Weapons acquisition, (6) Long-horizon planning, (7) AI development, (8) Situational awareness, (9) Self-proliferation.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Evaluation: Risks & Harms Taxonomies"], "Date": "9-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/pdf/2305.15324.pdf", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Frontier AI Regulation: Managing Emerging Risks to Public Safety", "Description": "Description of \"sufficiently dangerous capabilities\" of AI models to cause serious harm and disruption on a global scale, such as synthesising new biological or chemical weapons and evading human control through means of deception and obfuscation.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Evaluation: Risks & Harms Taxonomies"], "Date": "11-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/pdf/2307.03718.pdf", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "The Fallacy of AI Functionality", "Description": "Taxonomy of four AI failure points: (1) Impossible tasks (either Conceptually impossible or Practically impossible), (2) Engineering failures (Design failures, Implementation failures, 
Missing Safety Features), (3) Post-Deployment Failures (Robustness Issues, Failure under Adversarial Attacks, Unanticipated Interactions), (4) Communication Failures (Falsified or Overstated Capabilities, Misrepresented Capabilities).", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Evaluation: Risks & Harms Taxonomies"], "Date": "7-2022", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/pdf/2206.09511.pdf", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "TASRA: a Taxonomy and Analysis of Societal-Scale Risks from AI", "Description": "Framework of 3 potential harms from AI: (1) Harm to people (individual harm, Group/community harm, Societal harm), (2) Harm to an Organisation or Enterprise, (3) Harm to a system. ", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Evaluation: Risks & Harms Taxonomies"], "Date": "6-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/pdf/2306.06924.pdf", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Cohere in-context content moderation prompt", "Description": "Few-shot prompt for classifying whether text is toxic or not.", "Modalities": ["Text"], "Categories": ["Model Evaluation: Risks & Harms"], "Date": "12-2023", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://docs.cohere.com/reference/toxicity-detection", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "NVidia NeMo Guardrails", "Description": "Open-source tooling to create guardrails for LLM applications.", "Modalities": ["Text"], "Categories": ["Model Evaluation: Risks & Harms"], "Date": "4-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/pdf/2310.10501.pdf", "Website Link": "https://blogs.nvidia.com/blog/ai-chatbot-guardrails-nemo/", "GitHub Link": "https://github.com/NVIDIA/NeMo-Guardrails", "HuggingFace Link": "", "Added By": 
"Original Authors"} -{"Name": "SafetyPrompts", "Description": "Open repository of datasets for LLM safety", "Modalities": ["Text"], "Categories": ["Model Evaluation: Risks & Harms"], "Date": "1-2024", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://safetyprompts.com/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Model Risk Cards", "Description": "A framework for structured assessment and documentation of risks associated with an application of language models. Each RiskCard makes clear the routes for the risk to manifest harm, their placement in harm taxonomies, and example prompt-output pairs. The paper also describes 70+ risks identified from a literature survey.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Model Evaluation: Risks & Harms"], "Date": "3-2023", "Primary Link": "Paper", "Paper Link": "https://arxiv.org/pdf/2303.18190.pdf", "Website Link": "", "GitHub Link": "https://github.com/leondz/lm_risk_cards", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Aya Dataset", "Description": "A permissively licensed multilingual instruction finetuning dataset curated by the Aya Annotation Platform from Cohere For AI. 
The dataset contains a total of 204k human-annotated prompt-completion pairs along with the demographics data of the annotators, spanning 65 languages.", "Modalities": ["Text"], "Categories": ["Finetuning Data Catalogs"], "Date": "2-2024", "Primary Link": "Webpage", "Paper Link": "https://arxiv.org/abs/2402.06619", "Website Link": "https://cohere.com/research/aya", "GitHub Link": "", "HuggingFace Link": "https://huggingface.co/datasets/CohereForAI/aya_dataset", "Added By": "Original Authors"} -{"Name": "HuggingFace Provenance, Watermarking & Deepfake Detection Collection", "Description": "A collection of resources on provenance, watermarking & deepfake detection tools, that are used to assess the outputs of foundation models.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Usage Monitoring"], "Date": "2-2024", "Primary Link": "Hugging Face object", "Paper Link": "", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "https://huggingface.co/collections/society-ethics/provenance-watermarking-and-deepfake-detection-65c6792b0831983147bb7578", "Added By": "Original Authors"} -{"Name": "SIB-200", "Description": "A large-scale open-sourced benchmark dataset for topic classification in 200 languages and dialects.", "Modalities": ["Text"], "Categories": ["Model Evaluation: Capabilities"], "Date": "9-2023", "Primary Link": "GitHub", "Paper Link": "https://arxiv.org/abs/2309.07445", "Website Link": "", "GitHub Link": "https://github.com/dadelani/sib-200", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "French-PD-Newpapers", "Description": "Nearly three million unique newspaper and periodical editions (70B words) from the French National Library.", "Modalities": ["Text"], "Categories": ["Pretraining Data Sources"], "Date": "1-2024", "Primary Link": "Hugging Face object", "Paper Link": "", "Website Link": "", "GitHub Link": "", "HuggingFace Link": "https://huggingface.co/datasets/PleIAs/French-PD-Newspapers", "Added By": "Original 
Authors"} -{"Name": "Datatrove", "Description": "A library to process, filter and deduplicate text data at a very large scale", "Modalities": ["Text"], "Categories": ["Data Cleaning, Filtering, & Mixing"], "Date": "12-2023", "Primary Link": "GitHub", "Paper Link": "", "Website Link": "", "GitHub Link": "https://github.com/huggingface/datatrove", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "Nomic", "Description": "A proprietary service to explore data with embedding maps.", "Modalities": ["Text"], "Categories": ["Data Search, Analysis, & Exploration"], "Date": "Frequently Updated", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://home.nomic.ai/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} -{"Name": "AI Vulnerability Database", "Description": "An open-source, extensible knowledge base of AI failures.", "Modalities": ["Text", "Speech", "Vision"], "Categories": ["Usage Monitoring"], "Date": "Frequently Updated", "Primary Link": "Webpage", "Paper Link": "", "Website Link": "https://avidml.org/", "GitHub Link": "", "HuggingFace Link": "", "Added By": "Original Authors"} diff --git a/app/scripts/clearModules.js b/app/scripts/clearModules.js new file mode 100644 index 0000000..1b9c74a --- /dev/null +++ b/app/scripts/clearModules.js @@ -0,0 +1,14 @@ +const fs = require("fs"); + +const clearModules = (filePath) => { + if (fs.existsSync(filePath)) { + let fileContent = fs.readFileSync(filePath, "utf8"); + fileContent = fileContent.replace(/require\s*\([\s\S]*?\)/, ""); + fs.writeFileSync(filePath, fileContent, "utf8"); + } else { + console.log("File does not exist."); + } +}; + +clearModules("go.mod"); +clearModules("exampleSite/go.mod"); diff --git a/app/scripts/projectSetup.js b/app/scripts/projectSetup.js new file mode 100644 index 0000000..3ab7306 --- /dev/null +++ b/app/scripts/projectSetup.js @@ -0,0 +1,116 @@ +const fs = require("fs"); +const path = require("path"); + +const toggleComment = ({ 
filepath, regex }) => { + let updatedContent = fs.readFileSync(filepath, "utf8"); + const match = updatedContent.match(regex); + + if (match) { + const matchedContent = match[0]; + const hasComment = matchedContent.startsWith("# "); + if (hasComment) { + updatedContent = updatedContent.replace( + regex, + matchedContent.replace("# ", ""), + ); + fs.writeFileSync(filepath, updatedContent, "utf8"); + } else { + const hasBreakline = matchedContent.includes("\n"); + if (hasBreakline) { + const content = matchedContent + .split("\n") + .map((line) => "# " + line) + .join("\n"); + updatedContent = updatedContent.replace(regex, content); + fs.writeFileSync(filepath, updatedContent, "utf8"); + } + } + } +}; + +const getFolderName = (rootfolder) => { + const configPath = path.join(rootfolder, "exampleSite/hugo.toml"); + const getConfig = fs.readFileSync(configPath, "utf8"); + const match = getConfig.match(/theme\s*=\s*\[?"([^"\]]+)"\]?/); + let selectedTheme = null; + if (match && match[1]) { + selectedTheme = match[1]; + } + return selectedTheme; +}; + +const deleteFolder = (folderPath) => { + if (fs.existsSync(folderPath)) { + fs.rmSync(folderPath, { recursive: true, force: true }); + } +}; + +const createNewfolder = (rootfolder, folderName) => { + const newFolder = path.join(rootfolder, folderName); + fs.mkdirSync(newFolder, { recursive: true }); + return newFolder; +}; + +const iterateFilesAndFolders = (rootFolder, { destinationRoot }) => { + const directory = path.join(rootFolder); + const items = fs.readdirSync(directory, { withFileTypes: true }); + items.forEach((item) => { + if (item.isDirectory()) { + createNewfolder(destinationRoot, item.name); + iterateFilesAndFolders(path.join(directory, item.name), { + currentFolder: item.name, + destinationRoot: path.join(destinationRoot, item.name), + }); + } else { + const sourceFile = path.join(directory, item.name); + const destinationFile = path.join(destinationRoot, item.name); + fs.renameSync(sourceFile, 
destinationFile); + } + }); +}; + +const setupProject = () => { + const rootfolder = path.join(__dirname, "../"); + if (!fs.existsSync(path.join(rootfolder, "themes"))) { + // remove this part if you don't using theme demo as a module + [ + { + filepath: path.join(rootfolder, "exampleSite/hugo.toml"), + regex: /^.*theme\s*=\s*("[^"\]]+"|\S+)/m, + }, + { + filepath: path.join( + rootfolder, + "exampleSite/config/_default/module.toml", + ), + regex: /\[\[imports\]\]\s*\r?\npath = "([^"]+)"/, + }, + ].forEach(toggleComment); + + const folderList = ["layouts", "assets", "static"]; + const folderName = getFolderName(rootfolder); + const newfolderName = createNewfolder( + path.join(rootfolder, "themes"), + folderName, + ); + + folderList.forEach((folder) => { + const source = path.join(rootfolder, folder); + const destination = path.join(newfolderName, folder); + if (fs.existsSync(source)) { + fs.mkdirSync(destination, { recursive: true }); + iterateFilesAndFolders(source, { + currentFolder: folder, + destinationRoot: destination, + }); + deleteFolder(source); + } + }); + + const exampleSite = path.join(rootfolder, "exampleSite"); + iterateFilesAndFolders(exampleSite, { destinationRoot: rootfolder }); + deleteFolder(exampleSite); + } +}; + +setupProject(); diff --git a/app/scripts/removeDarkmode.js b/app/scripts/removeDarkmode.js new file mode 100644 index 0000000..cb1b863 --- /dev/null +++ b/app/scripts/removeDarkmode.js @@ -0,0 +1,69 @@ +const fs = require("fs"); +const path = require("path"); + +const rootDirs = ["assets/scss", "layouts"]; +const configFiles = [ + { + filePath: "exampleSite/tailwind.config.js", + patterns: ["darkmode:\\s*{[^}]*},", 'darkMode:\\s*"class",'], + }, + { + filePath: "exampleSite/data/theme.json", + patterns: ["colors.darkmode"], + }, +]; + +rootDirs.forEach(removeDarkModeFromPages); +configFiles.forEach(removeDarkMode); + +function removeDarkModeFromFiles(filePath, regexPatterns) { + const fileContent = fs.readFileSync(filePath, "utf8"); + 
let updatedContent = fileContent; + regexPatterns.forEach((pattern) => { + const regex = new RegExp(pattern, "g"); + updatedContent = updatedContent.replace(regex, ""); + }); + fs.writeFileSync(filePath, updatedContent, "utf8"); +} + +function removeDarkModeFromPages(directoryPath) { + const files = fs.readdirSync(directoryPath); + + files.forEach((file) => { + const filePath = path.join(directoryPath, file); + const stats = fs.statSync(filePath); + if (stats.isDirectory()) { + removeDarkModeFromPages(filePath); + } else if (stats.isFile()) { + removeDarkModeFromFiles(filePath, [ + '(?:(?!["])\\S)*dark:(?:(?![,;"])\\S)*', + "@apply?(\\s)*;", + ]); + } + }); +} + +function removeDarkMode(configFile) { + const { filePath, patterns } = configFile; + if (filePath === "exampleSite/tailwind.config.js") { + removeDarkModeFromFiles(filePath, patterns); + } else { + const contentFile = JSON.parse(fs.readFileSync(filePath, "utf8")); + patterns.forEach((pattern) => deleteNestedProperty(contentFile, pattern)); + fs.writeFileSync(filePath, JSON.stringify(contentFile)); + } +} + +function deleteNestedProperty(obj, propertyPath) { + const properties = propertyPath.split("."); + let currentObj = obj; + for (let i = 0; i < properties.length - 1; i++) { + const property = properties[i]; + if (currentObj.hasOwnProperty(property)) { + currentObj = currentObj[property]; + } else { + return; // Property not found, no need to continue + } + } + delete currentObj[properties[properties.length - 1]]; +} diff --git a/app/scripts/themeSetup.js b/app/scripts/themeSetup.js new file mode 100644 index 0000000..0d21aad --- /dev/null +++ b/app/scripts/themeSetup.js @@ -0,0 +1,125 @@ +const fs = require("fs"); +const path = require("path"); + +const toggleComment = ({ filepath, regex }) => { + let updatedContent = fs.readFileSync(filepath, "utf8"); + const match = updatedContent.match(regex); + + if (match) { + const matchedContent = match[0]; + const hasComment = matchedContent.startsWith("# "); + 
if (hasComment) { + const hasBreakline = matchedContent.includes("\n"); + if (hasBreakline) { + updatedContent = updatedContent.replace( + regex, + matchedContent.replace(/# /gm, ""), + ); + fs.writeFileSync(filepath, updatedContent, "utf8"); + } + } else { + updatedContent = updatedContent.replace(regex, "# " + matchedContent); + fs.writeFileSync(filepath, updatedContent, "utf8"); + } + } +}; + +const createNewfolder = (rootfolder, folderName) => { + const newFolder = path.join(rootfolder, folderName); + fs.mkdirSync(newFolder, { recursive: true }); + return newFolder; +}; + +const deleteFolder = (folderPath) => { + if (fs.existsSync(folderPath)) { + fs.rmSync(folderPath, { recursive: true, force: true }); + } +}; + +const getFolderName = (rootfolder) => { + const configPath = path.join(rootfolder, "exampleSite/hugo.toml"); + const getConfig = fs.readFileSync(configPath, "utf8"); + const match = getConfig.match(/theme\s*=\s*\[?"([^"\]]+)"\]?/); + let selectedTheme = null; + if (match && match[1]) { + selectedTheme = match[1]; + } + return selectedTheme; +}; + +const iterateFilesAndFolders = (rootFolder, { destinationRoot }) => { + const directory = path.join(rootFolder); + const items = fs.readdirSync(directory, { withFileTypes: true }); + items.forEach((item) => { + if (item.isDirectory()) { + createNewfolder(destinationRoot, item.name); + iterateFilesAndFolders(path.join(directory, item.name), { + currentFolder: item.name, + destinationRoot: path.join(destinationRoot, item.name), + }); + } else { + const sourceFile = path.join(directory, item.name); + const destinationFile = path.join(destinationRoot, item.name); + fs.renameSync(sourceFile, destinationFile); + } + }); +}; + +const setupTheme = () => { + const rootFolder = path.join(__dirname, "../"); + + if (!fs.existsSync(path.join(rootFolder, "exampleSite"))) { + // remove this part if you don't using theme demo as a module + [ + { + filepath: path.join(rootFolder, "config/_default/module.toml"), + regex: /# 
\[\[imports\]\]\s*\r?\n# path = "([^"]+)"/, + }, + { + filepath: path.join(rootFolder, "hugo.toml"), + regex: /^.*theme\s*=\s*("[^"\]]+"|\S+)/m, + }, + ].forEach(toggleComment); + + const includesFiles = [ + "tailwind.config.js", + "postcss.config.js", + "go.mod", + "hugo.toml", + "assets", + "config", + "data", + "content", + "i18n", + "static", + ]; + + const folder = createNewfolder(rootFolder, "exampleSite"); + + fs.readdirSync(rootFolder, { withFileTypes: true }).forEach((file) => { + if (includesFiles.includes(file.name)) { + if (file.isDirectory()) { + const destination = path.join(rootFolder, "exampleSite", file.name); + fs.mkdirSync(destination, { recursive: true }); + iterateFilesAndFolders(path.join(rootFolder, file.name), { + destinationRoot: destination, + }); + deleteFolder(path.join(rootFolder, file.name)); + } else { + fs.renameSync( + path.join(rootFolder, file.name), + path.join(folder, file.name), + ); + } + } + }); + + const themes = path.join(rootFolder, "themes"); + iterateFilesAndFolders(path.join(themes, getFolderName(rootFolder)), { + destinationRoot: rootFolder, + }); + deleteFolder(themes); + } +}; + +setupTheme(); diff --git a/app/src/__init__.py b/app/src/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/app/src/api/__init__.py b/app/src/api/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/app/src/api/api.py b/app/src/api/api.py deleted file mode 100644 index 0f56714..0000000 --- a/app/src/api/api.py +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/env python3 - -import warnings - -import pandas as pd - - -def is_date_match(release_date, filter_start, filter_end="2030"): - def convert_to_dt(x): - if isinstance(x, str): - return ( - pd.to_datetime(x, format="%m-%Y") - if "-" in x - else pd.to_datetime(x, format="%Y") - ) - return x - - if not release_date or release_date.lower() == "frequently updated": - return True - - try: - release_datetime = convert_to_dt(release_date) - filter_start_dt = 
convert_to_dt(filter_start) - filter_end_dt = convert_to_dt(filter_end) - - return filter_start_dt <= release_datetime <= filter_end_dt - except Exception as e: - warnings.warn(f"Error in date comparison: {e}") - return False - - -# Function to apply the date filter -def apply_date_filter(df, release_date): - return df[df["Date"].apply(lambda x: is_date_match(x, release_date))] - - -# Function to search for resources -def search_resources(df, search_string): - return df[ - df.apply( - lambda row: search_string.lower() in row["Name"].lower() - or search_string.lower() in row["Description"].lower(), - axis=1, - ) - ] - - -def filter_resources( - resources_df, sections, text_mod, vision_mod, speech_mod, time_range -) -> pd.DataFrame: - # breakpoint() - - # Apply sections filter - if "All" not in sections: - resources_df = resources_df[ - resources_df["Categories"].apply( - lambda x: any(item in sections for item in x) - ) - ] - - # Apply combined modality filter - allowed_modalities = { - m - for m, flag in zip( - ["Text", "Vision", "Speech"], [text_mod, vision_mod, speech_mod] - ) - if flag - } - resources_df = resources_df[ - resources_df["Modalities"].apply( - lambda mods: any(mod in allowed_modalities for mod in mods) - ) - ] - - # Apply date filter - resources_df = apply_date_filter(resources_df, time_range) - - return resources_df diff --git a/app/src/components/__init__.py b/app/src/components/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/app/src/components/goat_counter.py b/app/src/components/goat_counter.py deleted file mode 100644 index 9ff8d15..0000000 --- a/app/src/components/goat_counter.py +++ /dev/null @@ -1,35 +0,0 @@ -import pathlib - -import streamlit as st -from bs4 import BeautifulSoup - -from src.utils import get_environment - - -def add_goat_counter_tracker(): - script_id = "skiff-stats" - index_path = pathlib.Path(st.__file__).parent / "static" / "index.html" - - skiff_stats_script = f""" - - """ - - if 
get_environment() == "prod": - # This is an extreme workaround to be able to inject a script tag into the streamlit template. - # Streamlit goes through well-meaning lengths to prevent this for security reasons. - # So we are essentially altering the base streamlit index.html file. - # inspired by: https://github.com/streamlit/streamlit/issues/969#issuecomment-1030832993 - soup = BeautifulSoup(index_path.read_text(), features="lxml") - if not soup.find(id=script_id): # if cannot find tag - html = str(soup) - # Insert the script directly in the head tag of the static template - new_html = html.replace("", skiff_stats_script + "") - index_path.write_text(new_html) - else: - # Search for and remove the Skiff Stats script if it is found. - # This is mainly a tool for aiding development in case the prod ENV is being tested locally. - soup = BeautifulSoup(index_path.read_text(), features="lxml") - if soup.find(id=script_id): # if find tag, remove it - html = str(soup) - new_html = html.replace(skiff_stats_script, "") - index_path.write_text(new_html) diff --git a/app/src/constants.py b/app/src/constants.py deleted file mode 100644 index 5f859bc..0000000 --- a/app/src/constants.py +++ /dev/null @@ -1,252 +0,0 @@ -from pathlib import Path - -ORDERED_SECTION_HEADERS = { - "Pretraining Data Sources": """ - Pretraining data consists of thousands, or even millions, of individual documents, often web scraped. - As a result, their contents are often superficially documented or understood. - Model knowledge and behavior will likely reflect a compression of this information and its communication qualities. - Consequently, its important to carefully select the data composition. - This decision should reflect choices in the language coverage, the mix of sources, and preprocessing decisions. - We highlight a few of the most popular pretraining corpora which have accumulated deeper documentation or analyses. 
- """, - "Finetuning Data Catalogs": """ - Finetuning data is used for a variety of reasons: to hone specific capabilities, orient the model to a certain task - format, improve its responses to general instructions, mitigate harmful or unhelpful response patterns, or generally - align its responses to human preferences. - Developers increasingly use a variety of data annotations and loss objectives for traditional supervised finetuning, - [DPO](https://arxiv.org/abs/2305.18290) or reinforcement learning with [human feedback](https://arxiv.org/abs/2203.02155). - As a result of this variety, we recommend data catalogs, with attached documentation, to help an informed selection. - The largest catalog is [HuggingFace Datasets](https://huggingface.co/docs/datasets/index), though cross-reference its metadata with - academic papers and repositories, as its crowdsourced documentation can be sparse or incorrect. - - Aside from HuggingFace Datasets, we point to some lesser known resources that catalog more specialized finetuning - data sources. - The breadth of available finetuning data is expansive, so we focus on catalogs rather than individual datasets, - and particularly those that provide strong documentation or more specialized sources. - """, - "Data Search, Analysis, & Exploration": """ - Exploring training datasets with search and analysis tools helps practitioners develop a nuanced intuition for - what's in the data, and therefore their model. - Many aspects of data are difficult to summarize or document without hands-on exploration. - For instance, text data can have a distribution of lengths, topics, tones, formats, licenses, and even diction. - We recommend developers use the many available tools to search and analyze their training datasets. - """, - "Data Cleaning, Filtering, & Mixing": """ - Data cleaning and filtering is an important step in curating a dataset. Filtering and cleaning remove unwanted - data from the dataset. 
They can improve training efficiency as well as ensuring that data has desirable properties, - including: high information content, desired languages, low toxicity, and minimal personally identifiable information. - We recommend that practitioners consider the possible trade-offs when using some filters. - For example, [Dodge et al. (2021)](https://arxiv.org/abs/2104.08758) find that some filters disproportionately remove - text written by, and about, minority individuals. Additionally, [Welbl et al. (2021)](https://arxiv.org/abs/2109.07445) - and [Longpre et al. (2023)](https://arxiv.org/abs/2305.13169) find that - removing content that classifiers believe are "toxic" can have adverse affects, including lowering performance on - evaluations, and disproportionately removing text representing marginalized groups. Data mixing is another - important component of data preparation, where the mixture proportions of pretraining data domains - (e.g. scientific articles, GitHub, and books) have been shown to dramatically affect downstream performance, - as shown in [The Pile](https://arxiv.org/abs/2101.00027), [Doremi](https://crfm.stanford.edu/2023/09/14/doremi), and - [Efficient Online Data Mixing](https://arxiv.org/abs/2312.02406).""", - "Data Deduplication": """ - Data deduplication is an important preprocessing step where duplicated documents, or chunks within a document, - are removed from the dataset. Removing duplicates can reduce the likelihood of memorizing undesirable pieces - of information such as boilerplate text, copyrighted data, and personally identifiable information. Additionally, - removing duplicated data improves training efficiency by reducing the total dataset size. - Practitioners should always determine whether duplicated data will harm or help the model for their use case. 
- For example, memorization is a crucial component for a model intended to be used in a closed-book question - answering system, but will tend to be harmful for application-agnostic models - (see [Lee et al. (2022)](https://arxiv.org/abs/2107.06499)).""", - "Data Decontamination": """ - Data decontamination is the process of removing evaluation data from the training dataset. This important step in data - preprocessing ensures the integrity of model evaluation, ensuring that metrics are reliable and not misleading. - The following resources aid in proactively protecting test data with canaries, decontaminating data before training, and - identifying or proving what data a model was trained on. - [A Note on Canary Exposure](https://arxiv.org/abs/2306.00133) explains how to interpret canary exposure, including by relating it to membership inference - attacks, and differential privacy. - [Proving Test Set Contamination in Black Box Language Models](https://arxiv.org/abs/2310.17623) provides methods for provable guarantees of test set contamination in language models without - access to pretraining data or model weights. - """, - "Data Auditing": """ - Auditing datasets is an essential component of dataset design. You should always spend a substantial amount of - time reading through your dataset, ideally at many stages of the dataset design process. Many datasets have - problems specifically because the authors did not do sufficient auditing before releasing them. - - At early stages of a project the data search, analysis, & exploration tools, outlined in the Data Search section, - are typically sufficient to track the evolution of a dataset. However it can also be helpful to do systematic - studies of the process.""", - "Data Documentation": """ - When releasing new data resources with a model, it is important to thoroughly document the - data, (see e.g. [Data Statements](https://aclanthology.org/Q18-1041/) or the [Data Nutrition Project](https://datanutrition.org/). 
- Documentation allows users to understand its intended uses, legal restrictions, attribution, relevant contents, - privacy concerns, and other limitations. - It is common for datasets to be widely used by practitioners, who may be unaware of undesirable properties - (such as [CSAM](https://www.theverge.com/2023/12/20/24009418/generative-ai-image-laion-csam-google-stability-stanford)). - While many data documentation standards have been proposed, their adoption has been uneven, or when - crowdsourced, as with [Hugging Face Datasets](https://huggingface.co/docs/datasets/index), they may contain - [some errors and omissions](https://arxiv.org/abs/2310.16787). - """, - "Data Governance": """ - Releasing all datasets involved in the development of a Foundation Model, including training, fine-tuning, and - evaluation data, can facilitate external scrutiny and support further research. However, releasing and hosting - the data as it was used may not always be an option, especially when it includes data with external rights-holders; - e.g., when data subjects' privacy, intellectual property, or other rights need to be taken into account. Proper - data governance practices can be required at the curation and release stages to account for these rights. - - In some jurisdictions, projects may be required to start with a Data Management Plan that requires developers to - ensure that the data collection has a sufficient legal basis, follows principles of data minimization, and allows - data subject to have sufficient visibility into and control over their representation in a dataset - ([CNIL resource sheet](https://www.cnil.fr/en/ai-how-sheets)). Data curation steps to that end can include - respecting opt-out preference signals (Spawning, HaveIBeenTrained), or applying pseudonymization or PII redaction - (BigCode Governance card). 
- - Once a dataset is released, it can be made available either broadly or with access control based on research needs - (ROOTS, BigCode PII training dataset). Developers can also enable data subjects to ask for removal from the hosted - version of the dataset by providing a contact address (OSCAR, PAraCrawl), possibly complemented by a membership - test to check whether their data is included (Stack data portraits) or an automated process (BigCode, AmIinTheStack). - """, - "Model Training: Pretraining Repositories": """ - Practitioners should consider using already-optimized codebases, especially in the pre-training phase, to - ensure effective use of computational resources, capital, power, and effort. Existing open-source codebases - targeted at foundation model pretraining can make pretraining significantly more accessible to new practitioners - and help accumulate techniques for efficiency in model training. - - Here, we provide a sample of existing widely-used pre-training codebases or component tools that developers can - use as a jumping-off point for pre-training foundation models.""", - "Model Training: Finetuning Repositories": """ - Fine-tuning, or other types of adaptation performed on foundation models after pretraining, are an equally - important and complex step in model development. Fine-tuned models are more frequently deployed than base models. - - Here, we also link to some useful and widely-used resources for adapting foundation models or otherwise fine-tuning - them. Use of these tools can ensure greater ecosystem compatibility of resulting models, or reduce the barrier to - experimentation by abstracting away common pitfalls or providing guidance on effective hyperparameters.""", - "Model Training: Efficiency & Resource Allocation": """ - Knowledge of training best practices and efficiency techniques can reduce costs to train a desired model significantly. 
- Here, we include a select few readings and resources on effectively using a given resource budget for model training, - such as several canonical papers on fitting *scaling laws*, a common tool for extrapolating findings across - scales of cost. These are used frequently to determine the most efficient allocation of resources, such as allocating - compute between model size and dataset size for a given budget. - - Additionally, practitioners seeking to embrace an open approach to model development should consider how their - decisions when training a foundation model may have impacts long after that model's creation and release. - For instance, a model that is released openly but is too computationally demanding to be run on consumer-grade - hardware will be limited in its impact on the field, or a model trained to minimize training compute but not - minimize inference cost may result in a greater environmental impact than spending more training compute in the - first place for a cheaper-to-infer model. Practitioners should thus be aware of potential second-order effects of - their model releases and training choices.""", - "Model Training: Educational Resources": """ - Training models at any scale can be quite daunting to newer practitioners. Here, we include several educational - resources that may be useful in learning about the considerations required for successfully and effectively - training or fine-tuning foundation models, and recommend that practitioners review these resources and use them - to guide further reading about model training and usage. """, - "Environmental Impact": """ - Current tools, including the ones mentioned in the table, focus on the latter point by measuring the energy - consumed during training or inference and multiplying it by the carbon intensity of the energy source used. - While other steps of the model life cycle (e.g. 
manufacturing hardware, heating/cooling datacenters, storing and - transferring data) also come with environmental impacts, we currently lack the information necessary to meaningfully - measure these impacts (see [Estimating the Carbon Footprint of BLOOM](https://arxiv.org/abs/2211.02001)). - The table below outlines resources for back-of-the-envelope estimations of environmental impact, in-code estimation, - as well as dashboard for cloud computing platforms to estimate environmental impact - (see [Carbontracker](https://arxiv.org/abs/2007.03051) and - [Quantifying the Carbon Emissions of Machine Learning](https://arxiv.org/abs/1910.09700)). - - For efficient use of resources, several decisions made during or prior to model training can have significant impacts on the upstream and - downstream environmental impact of a given model. - Use [Scaling Laws](https://arxiv.org/abs/2001.08361) and other methodologies to find the best allocation of your compute budget. - For models frequently used downstream, consider the inference footprint and inference cost during model creation, - to minimize the environmental impact of inference (see - [Scaling Data-Constrained Language Models](https://arxiv.org/abs/2305.16264)). - For further resources and discussion, see the resources and best practices for training models efficiently.""", - "Model Evaluation: Capabilities": """ - Many modern foundation models are released with general conversational abilities, such that their use cases are - poorly specified and open-ended. - This poses significant challenges to evaluation benchmarks which are unable to critically evaluate so many tasks, - applications, and risks systematically or fairly. - As a result, it is important to carefully scope the original intentions for the model, and the evaluations to those - intentions. 
- Even then, the most relevant evaluation benchmarks may not align with real use, and so should be qualified with their - limitations, and carefully supplemented with real user/human evaluation settings, where feasible. - - Below we note common benchmarks, as of December 2023, but caution that all of these come with substantial limitations. - For instance, many multiple choice college knowledge benchmarks are not indicative of real user questions, and can - be gamed with pseudo-data contamination. - Additionally, while leaderboards are exceedingly popular, model responses are often scored by other models, which - have implicit biases to model responses that are longer, and look similar to their own - (see [AlpacaFarm](https://arxiv.org/abs/2305.14387)).""", - "Model Evaluation: Risks & Harms Taxonomies": """ - Taxonomies provide a way of categorising, defining and understanding risks and hazards created through the use and - deployment of AI systems. Some taxonomies focus primarily on the types of interactions and uses that - *create* a risk of harm (often called "hazards") whereas others focus on the negative effects that - they lead to (often called "harms"). - Some taxonomies focus on existing issues, such as models that create hate speech or child abuse material, whereas - others are focused on longer term threats related to dangerous weapons development, cybersecurity, and military use. - These tend to focus on future model capabilities and their misuse. - Many taxonomies assess the available evidence for the risks and hazards, discuss their impact, and offer mitigation - strategies. 
- There is a substantial focus on text-only models and future work should consider paying more attention to multimodal - models.""", - "Model Evaluation: Risks & Harms": """ - Evaluations of risk serve multiple purposes: to identify if there are issues which need mitigation, to track the - success of any such mitigations, to document for other users of the model what risks are still present, and to - help make decisions related to model access and release. - Harm is highly contextual, so developers should consider the context in which their foundation model might be used - and evaluate the highest severity and most likely risks. - - To think through the possible risks, many taxonomies of harm have been created and provide good starting points. - Determining how to evaluate risk is also challenging, as there are risks and modalities with limited evaluation - coverage. The sample included below are a starting point for certain key areas, but we encourage developers to - browse the evaluation repository (linked below) to see if there is something more suited to their needs. - In addition to fixed benchmarks, an emergent approach to evaluation is using one model to evaluate another - (see [Red Teaming Language Models with Language Models](https://arxiv.org/abs/2202.03286)) and Anthropic's - [Constitutional AI](https://www.anthropic.com/news/constitutional-ai-harmlessness-from-ai-feedback) work.""", - "Model Documentation": """ - It is important to document models that are used and released. Even models and code released openly are important - to document thoroughly, in order to specify how to use the model, recommended and non-recommended use cases, - potential harms, state or justify decisions made during training, and more. - - Documenting models is important not just for responsible development, but also to enable other developers to - effectively build on a model. Models are not nearly as useful as artifacts if not properly documented. 
- - We include frequently-used standards for model documentation as well as tools for easy following of standards - and creation of documentation. """, - "Reproducibility": """ - Model releases often go accompanied with claims on evaluation performance, but those results are not always - reproducible, or can be misleading. - If code is not released, is not comprehensive, is difficult to run, or misses key details, this will cost the - scientific community time and effort to replicate and verify the claims. - Replication time will also slow progress, and discourage developers from adopting that resource over others. - - For these reasons, we strongly recommend carefully curating code, for model training, inference and evaluation. - Reproducible code begins with clear dependencies, versioning, and setup scripts, that are easy to adopt even if - the tools and frameworks are unfamiliar. - Clear documentation, code legibility and scripts for each entry point also improve ease of adoption. - Notably, Colab Notebooks provide shareable environment setup and execution tools. - These measures will significantly improves scientific reproducibility, and transparency.""", - "License Selection": """ - Foundation models, like software, are accompanied by licenses that determine how they may be distributed, used, - and repurposed. There are a variety of licenses to choose between for open foundation model developers, presenting - potential challenges for new developers. The table below includes resources that can help guide developers through - the process of selecting a specific license for their model as well as several examples of licenses that include - use restrictions. While licenses with use restrictions may be appropriate for certain types of models, in other - cases use restrictions can limit the ability of certain categories of stakeholders to re-use or adapt the models. 
- - Responsible AI Licenses in particular, including BigScience's Open RAIL and AI2's ImpACT Licenses, have seen - growing adoption, but also criticism of the difficulties they may pose even for well-intentioned actors seeking - to comply with their requirements---especially in commercial applications---and because their enforceability still - remains an open question - (see [AI Licensing Can't Balance "Open" with "Responsible"](https://katedowninglaw.com/2023/07/13/ai-licensing-cant-balance-open-with-responsible/)). - While they can provide a convenient way to help a developer - express their understanding of their model's limitations, in conjunction with a model card that outlines - in-scope and out-of-scope uses, adopters should also consider unintended consequences in limiting the scope of - the follow-up research that may be conducted with the licensed artifacts. Responsible AI licenses can act as a - useful norm-setting and self-reflection tool, but users should be aware of their limitations and potential downsides, - especially compared to established open-source software licenses.""", - "Usage Monitoring": """ - Some open foundation model developers attempt to monitor the usage of their models, whether by watermarking model - outputs or gating access to the model. - The table below includes resources related to usage monitoring, including examples of how to watermark content, - provide guidance on appropriate use, report adverse events associated with model use, and limit some forms of - access to models. - Several of these approaches have significant drawbacks: for example, there are no known robust watermarking - techniques for language models. 
As with many of the sections above, usage monitoring remains an area of active - research.""", -} - -BASE_DIR = Path(__file__).resolve().parent.parent diff --git a/app/src/theme/__init__.py b/app/src/theme/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/app/src/theme/style.css b/app/src/theme/style.css deleted file mode 100644 index 2030f4c..0000000 --- a/app/src/theme/style.css +++ /dev/null @@ -1,117 +0,0 @@ -:root { - /* Color variables */ - --color-b6: #265ED4; - --color-b7: #1A4CAE; - --color-b8: #1B4596; - --color-b10: #223367; - --color-n2: #F8F9FA; - /* semantic palette */ - --color-palette-primary-default: var(--color-b6); - --color-palette-primary-dark: var(--color-b7); - --color-palette-primary-veryDark: var(--color-b8); - - --color-palette-background-light: var(--color-n2); - --color-palette-background-dark: var(--color-b10); - - /* Breakpoints */ - --breakpoint-md: 768px; - - /* Z indices */ - /* Note this value was observed in their calendar component popover and is not officially documented anywhere. - * We are assuming that everything above this will appear over their popovers. 
- */ - --z-index-streamlit-popover: 1000040; - /* semantic z indices */ - /* Place the header just under popovers but above all other content */ - --z-index-header: calc(var(--z-index-streamlit-popover) - 1); -} - -/* Hide default streamlit header */ -header { - visibility: hidden; -} - -/* Custom AI2 header */ -.header { - background: var(--color-palette-background-dark); - color: var(--color-palette-background-light); - position: fixed; - top: 0; - left: 0; - width: 100%; - padding: .25em 0; - z-index: var(--z-index-header); -} - -.header>* { - margin-left: calc(18rem); -} - -@media only screen and (max-width: var(--breakpoint-md)) { - .header>* { - margin-left: 1em; - } -} - -/* Pushing streamlit controls below the custom AI2 banner */ -#MainMenu { - visibility: visible; - /* moving menu button below our custom header */ - margin-top: 4rem; -} - -section[data-testid="stSidebar"] { - margin-top: calc(2rem + 2px); -} - -section[data-testid="stSidebar"]>div:nth-child(2) { - top: calc(2.5rem); -} - -/* Button styling */ -button[kind="primary"] { - background-color: var(--color-palette-primary-dark); - color: var(--color-palette-background-light); -} - -button[kind="primary"]:hover, -button[kind="primary"]:active, -button[kind="primary"]:focus, -button[kind="primary"]:focus:not(:active) { - color: #fff; - text-decoration: none; - border-color: #3a69ba; - background: #3a69ba; -} - -.block-container { - padding-top: 0rem; - padding-bottom: 1rem; - padding-left: 4rem; - padding-right: 4rem; -} - -/* Hide default streamlit footer */ -footer { - visibility: hidden; -} - - -/* Hide borders for the table */ -table { - border: none!important; - /* Remove borders around the outside of the table */ -} - -/* Hide borders for table cells */ -th, td { - border: none!important; - border-style: none; - border-width: 0px; - /* Remove borders around table cells */ -} - - -#logo { - margin-bottom: 0rem; -} diff --git a/app/src/theme/theme.py b/app/src/theme/theme.py deleted file mode 
100644 index 79dedc1..0000000 --- a/app/src/theme/theme.py +++ /dev/null @@ -1,52 +0,0 @@ -import os -from os import path - -import streamlit as st - -from src.utils import get_environment - -__location__ = path.realpath(path.join(os.getcwd(), path.dirname(__file__))) - - -def add_theme(): - with open(os.path.join(__location__, "style.css")) as f: - st.markdown(f"", unsafe_allow_html=True) - # hide the streamlit menu if prod - if get_environment() != "dev": - st.markdown( - """ - - """, - unsafe_allow_html=True, - ) - # add the AI2 banner - add_header() - - -def add_header(): - ... - # st.write( - # '', - # unsafe_allow_html=True, - # ) - - -def add_footer(): - """ - Please don't edit or remove this footer as we'd like to include these important links on all AI2 applications - """ - st.markdown( - "" - "Website hosted by the Allen Institute for AI | " - " Privacy Policy | " - " Terms of Use | " - " Business Code of Conduct" - "
", - unsafe_allow_html=True, - ) diff --git a/app/src/utils.py b/app/src/utils.py deleted file mode 100644 index ae36e6b..0000000 --- a/app/src/utils.py +++ /dev/null @@ -1,86 +0,0 @@ -import base64 -import gzip -import os -from pathlib import Path - -import jsonlines -import pandas as pd -import streamlit as st - -from .constants import BASE_DIR - - -def read_jsonl(inpath: Path): - if inpath.suffix in (".gz", ".gzip"): - with gzip.open(inpath, "rb") as fp: - j_reader = jsonlines.Reader(fp) - return [l for l in j_reader] - else: - with open(inpath, "rb") as fp: - j_reader = jsonlines.Reader(fp) - return [l for l in j_reader] - - -@st.cache_data -def load_data(): - df = pd.DataFrame(read_jsonl(BASE_DIR / "resources" / "resources.jsonl")).fillna("") - logos = load_logos() - - def add_links(row): - links = [] - if row["Paper Link"]: - links.append(create_markdown_img(logos["arxiv"], row["Paper Link"], 20)) - if row["HuggingFace Link"]: - links.append(create_markdown_img(logos["hf"], row["HuggingFace Link"], 20)) - if row["GitHub Link"]: - links.append(create_markdown_img(logos["github"], row["GitHub Link"], 20)) - if row["Website Link"]: - links.append(create_markdown_img(logos["web"], row["Website Link"], 20)) - - return " ".join(links) - - df["Links"] = df.apply(add_links, axis=1) - - def add_modality(row): - return " ".join( - [ - create_markdown_img(logos[modality.lower()], None, 20) - for modality in row["Modalities"] - ] - ) - - df["Modality"] = df.apply(add_modality, axis=1) - - return df - - -def load_logos(): - def get_image_base64(image_path): - with open(image_path, "rb") as image_file: - return base64.b64encode(image_file.read()).decode() - - return { - "hf": get_image_base64(BASE_DIR / "resources" / "logos/hf.png"), - "web": get_image_base64(BASE_DIR / "resources" / "logos/web.png"), - "arxiv": get_image_base64(BASE_DIR / "resources" / "logos/arxiv.png"), - "github": get_image_base64(BASE_DIR / "resources" / "logos/github.png"), - "text": 
get_image_base64(BASE_DIR / "resources" / "logos/text.png"), - "vision": get_image_base64(BASE_DIR / "resources" / "logos/vision.png"), - "speech": get_image_base64(BASE_DIR / "resources" / "logos/speech.png"), - "cheatsheet": get_image_base64( - BASE_DIR / "resources" / "logos/cheatsheet-0.png" - ), - } - - -@st.cache_data -def create_markdown_img(base64_string, link_url=None, dim=15): - img_tag = f'' - if link_url: - return f'{img_tag}' - else: - return img_tag - - -def get_environment(): - return os.getenv("STREAMLIT_ENV", "dev") diff --git a/app/resources/logos/arxiv.png b/app/static/app/resources/logos/arxiv.png similarity index 100% rename from app/resources/logos/arxiv.png rename to app/static/app/resources/logos/arxiv.png diff --git a/app/static/app/resources/logos/cheatsheet-0.png b/app/static/app/resources/logos/cheatsheet-0.png new file mode 100644 index 0000000..2d7eb96 Binary files /dev/null and b/app/static/app/resources/logos/cheatsheet-0.png differ diff --git a/app/resources/logos/github.png b/app/static/app/resources/logos/github.png similarity index 100% rename from app/resources/logos/github.png rename to app/static/app/resources/logos/github.png diff --git a/app/static/app/resources/logos/hf.png b/app/static/app/resources/logos/hf.png new file mode 100644 index 0000000..b89b723 Binary files /dev/null and b/app/static/app/resources/logos/hf.png differ diff --git a/app/resources/logos/logo.png b/app/static/app/resources/logos/logo.png similarity index 100% rename from app/resources/logos/logo.png rename to app/static/app/resources/logos/logo.png diff --git a/app/resources/logos/speech.png b/app/static/app/resources/logos/speech.png similarity index 100% rename from app/resources/logos/speech.png rename to app/static/app/resources/logos/speech.png diff --git a/app/resources/logos/text.png b/app/static/app/resources/logos/text.png similarity index 100% rename from app/resources/logos/text.png rename to app/static/app/resources/logos/text.png diff 
--git a/app/resources/logos/vision.png b/app/static/app/resources/logos/vision.png similarity index 100% rename from app/resources/logos/vision.png rename to app/static/app/resources/logos/vision.png diff --git a/app/resources/logos/web.png b/app/static/app/resources/logos/web.png similarity index 100% rename from app/resources/logos/web.png rename to app/static/app/resources/logos/web.png diff --git a/app/resources/orgs-nomlc.png b/app/static/app/resources/orgs-nomlc.png similarity index 100% rename from app/resources/orgs-nomlc.png rename to app/static/app/resources/orgs-nomlc.png diff --git a/app/resources/orgs.png b/app/static/app/resources/orgs.png similarity index 100% rename from app/resources/orgs.png rename to app/static/app/resources/orgs.png diff --git a/app/resources/paper.pdf b/app/static/app/resources/paper.pdf similarity index 100% rename from app/resources/paper.pdf rename to app/static/app/resources/paper.pdf diff --git a/app/static/foundation-model-development-cheatsheet.pdf b/app/static/foundation-model-development-cheatsheet.pdf new file mode 100644 index 0000000..8ab853d Binary files /dev/null and b/app/static/foundation-model-development-cheatsheet.pdf differ diff --git a/app/static/images/resource-icons/github-light.png b/app/static/images/resource-icons/github-light.png new file mode 100644 index 0000000..c11af58 Binary files /dev/null and b/app/static/images/resource-icons/github-light.png differ diff --git a/app/static/images/resource-icons/github.png b/app/static/images/resource-icons/github.png new file mode 100644 index 0000000..6bd03ac Binary files /dev/null and b/app/static/images/resource-icons/github.png differ diff --git a/app/static/images/resource-icons/huggingface-light.png b/app/static/images/resource-icons/huggingface-light.png new file mode 100644 index 0000000..ce2fb6d Binary files /dev/null and b/app/static/images/resource-icons/huggingface-light.png differ diff --git a/app/static/images/resource-icons/huggingface.png 
b/app/static/images/resource-icons/huggingface.png new file mode 100644 index 0000000..93dfa15 Binary files /dev/null and b/app/static/images/resource-icons/huggingface.png differ diff --git a/app/static/images/resource-icons/paper-dark.png b/app/static/images/resource-icons/paper-dark.png new file mode 100644 index 0000000..f38bc14 Binary files /dev/null and b/app/static/images/resource-icons/paper-dark.png differ diff --git a/app/static/images/resource-icons/paper-light.png b/app/static/images/resource-icons/paper-light.png new file mode 100644 index 0000000..1dfd4a7 Binary files /dev/null and b/app/static/images/resource-icons/paper-light.png differ diff --git a/app/static/images/resource-icons/paper.png b/app/static/images/resource-icons/paper.png new file mode 100644 index 0000000..ad3b68f Binary files /dev/null and b/app/static/images/resource-icons/paper.png differ diff --git a/app/static/images/resource-icons/web-light.png b/app/static/images/resource-icons/web-light.png new file mode 100644 index 0000000..e057b9b Binary files /dev/null and b/app/static/images/resource-icons/web-light.png differ diff --git a/app/static/images/resource-icons/web.png b/app/static/images/resource-icons/web.png new file mode 100644 index 0000000..361ace4 Binary files /dev/null and b/app/static/images/resource-icons/web.png differ diff --git a/app/static/robots.txt b/app/static/robots.txt new file mode 100644 index 0000000..f2576cf --- /dev/null +++ b/app/static/robots.txt @@ -0,0 +1,13 @@ +# If you would like to obtain the data behind this application, please contact +# The Allen Institute for AI: http://www.allenai.org +# +# Our models can require a significant amount of computation, and crawling the +# app comes at our expense and can even cause applications to fail. +# We will block unauthorized crawling of the app. 
+# +# Privacy Policy: https://allenai.org/privacy-policy.html +# Terms and Conditions of use: https://allenai.org/terms.html + +User-agent: * +Disallow: /*.pdf$ +Disallow: /*.json$ diff --git a/app/tailwind.config.js b/app/tailwind.config.js new file mode 100755 index 0000000..7b16680 --- /dev/null +++ b/app/tailwind.config.js @@ -0,0 +1,121 @@ +const fs = require("fs"); +const path = require("path"); +const themePath = path.join(__dirname, "data/theme.json"); +const themeRead = fs.readFileSync(themePath, "utf8"); +const theme = JSON.parse(themeRead); +const hugoStatsPath = path.join(__dirname, "hugo_stats.json"); +const hugoStatsRead = fs.readFileSync(hugoStatsPath, "utf8"); + +// Function to decode Unicode escape sequences +function decodeUnicode(json) { + return JSON.parse(json, (key, value) => { + if (typeof value === "string") { + return value.replace(/\\u[\dA-Fa-f]{4}/g, (match) => { + return String.fromCharCode(parseInt(match.slice(2), 16)); + }); + } + return value; + }); +} + +fs.writeFileSync( + "hugo_stats_decoded.json", + JSON.stringify(decodeUnicode(hugoStatsRead), null, 2), +); + +let font_base = Number(theme.fonts.font_size.base.replace("px", "")); +let font_scale = Number(theme.fonts.font_size.scale); +let h6 = font_base / font_base; +let h5 = h6 * font_scale; +let h4 = h5 * font_scale; +let h3 = h4 * font_scale; +let h2 = h3 * font_scale; +let h1 = h2 * font_scale; +let fontPrimary, fontPrimaryType, fontSecondary, fontSecondaryType; +if (theme.fonts.font_family.primary) { + fontPrimary = theme.fonts.font_family.primary + .replace(/\+/g, " ") + .replace(/:[ital,]*[ital@]*[wght@]*[0-9,;]+/gi, ""); + fontPrimaryType = theme.fonts.font_family.primary_type; +} +if (theme.fonts.font_family.secondary) { + fontSecondary = theme.fonts.font_family.secondary + .replace(/\+/g, " ") + .replace(/:[ital,]*[ital@]*[wght@]*[0-9,;]+/gi, ""); + fontSecondaryType = theme.fonts.font_family.secondary_type; +} + +/** @type {import('tailwindcss').Config} */ 
+module.exports = { + content: ["./hugo_stats_decoded.json"], + safelist: [{ pattern: /^swiper-/ }], + darkMode: "class", + theme: { + screens: { + sm: "240px", + md: "768px", + lg: "1024px", + xl: "1280px", + "2xl": "1536px", + }, + container: { + center: true, + padding: "2rem", + }, + extend: { + colors: { + text: theme.colors.default.text_color.default, + light: theme.colors.default.text_color.light, + dark: theme.colors.default.text_color.dark, + primary: theme.colors.default.theme_color.primary, + secondary: theme.colors.default.theme_color.secondary, + body: theme.colors.default.theme_color.body, + border: theme.colors.default.theme_color.border, + "theme-light": theme.colors.default.theme_color.theme_light, + "theme-dark": theme.colors.default.theme_color.theme_dark, + darkmode: { + text: theme.colors.darkmode.text_color.default, + light: theme.colors.darkmode.text_color.light, + dark: theme.colors.darkmode.text_color.dark, + primary: theme.colors.darkmode.theme_color.primary, + secondary: theme.colors.darkmode.theme_color.secondary, + body: theme.colors.darkmode.theme_color.body, + border: theme.colors.darkmode.theme_color.border, + "theme-light": theme.colors.darkmode.theme_color.theme_light, + "theme-dark": theme.colors.darkmode.theme_color.theme_dark, + }, + }, + fontSize: { + base: font_base + "px", + h1: h1 + "rem", + "h1-sm": h1 * 0.8 + "rem", + h2: h2 + "rem", + "h2-sm": h2 * 0.8 + "rem", + h3: h3 + "rem", + "h3-sm": h3 * 0.8 + "rem", + h4: h4 + "rem", + h5: h5 + "rem", + h6: h6 + "rem", + }, + fontFamily: { + primary: [fontPrimary, fontPrimaryType], + secondary: [fontSecondary, fontSecondaryType], + }, + }, + }, + plugins: [ + require("@tailwindcss/typography"), + require("@tailwindcss/forms"), + require("tailwind-bootstrap-grid")({ + generateContainer: false, + gridGutterWidth: "2rem", + gridGutters: { + 1: "0.25rem", + 2: "0.5rem", + 3: "1rem", + 4: "1.5rem", + 5: "3rem", + }, + }), + ], +}; diff --git a/app/theme.toml b/app/theme.toml new file 
mode 100644 index 0000000..0fee19f --- /dev/null +++ b/app/theme.toml @@ -0,0 +1,38 @@ +name = "Hugoplate (Modified by aimodels.org)" +license = "MIT" +licenselink = "https://github.com/zeon-studio/hugoplate/blob/main/LICENSE" +description = "Hugoplate is a free starter template built with Hugo, and TailwindCSS, providing everything you need to jumpstart your Hugo project and save valuable time." +homepage = "https://github.com/zeon-studio/hugoplate" +demosite = "https://hugoplate.netlify.app/" +min_version = "0.115.4" + +tags = [ + "blog", + "responsive", + "minimal", + "personal", + "light", + "dark", + "multilingual", + "landing", + "contact", + "dark mode", + "tailwindcss", +] + +features = [ + "Multi-Authors", + "Search", + "Multilingual", + "Dark Mode", + "Taxonomies", +] + +[author] +name = "Justin Riddiough" +homepage = "https://aimodels.org" + +[original] +author = "Zeon Studio" +homepage = "https://zeon.studio" +repo = "https://github.com/zeon-studio/themeplate" diff --git a/app/themes/hugoplate/assets/js/main.js b/app/themes/hugoplate/assets/js/main.js new file mode 100755 index 0000000..f05344e --- /dev/null +++ b/app/themes/hugoplate/assets/js/main.js @@ -0,0 +1,50 @@ +// main script +(function () { + "use strict"; + + // Dropdown Menu Toggler For Mobile + // ---------------------------------------- + const dropdownMenuToggler = document.querySelectorAll( + ".nav-dropdown > .nav-link", + ); + + dropdownMenuToggler.forEach((toggler) => { + toggler?.addEventListener("click", (e) => { + e.target.parentElement.classList.toggle("active"); + }); + }); + + // Testimonial Slider + // ---------------------------------------- + // new Swiper(".testimonial-slider", { + // spaceBetween: 24, + // loop: true, + // pagination: { + // el: ".testimonial-slider-pagination", + // type: "bullets", + // clickable: true, + // }, + // breakpoints: { + // 768: { + // slidesPerView: 2, + // }, + // 992: { + // slidesPerView: 3, + // }, + // }, + // }); + + // Audio Player 
Control + // ---------------------------------------- + const audioPlayers = document.querySelectorAll('.audio-player'); + + audioPlayers.forEach(player => { + player.addEventListener('play', function() { + audioPlayers.forEach(otherPlayer => { + if (otherPlayer !== player && !otherPlayer.paused) { + otherPlayer.pause(); + } + }); + }); + }); +})(); diff --git a/app/themes/hugoplate/assets/plugins/maps/google-map.js b/app/themes/hugoplate/assets/plugins/maps/google-map.js new file mode 100644 index 0000000..884b5b1 --- /dev/null +++ b/app/themes/hugoplate/assets/plugins/maps/google-map.js @@ -0,0 +1,179 @@ +/*!*************************************************** + * Google Map + *****************************************************/ + +window.marker = null; + +function initialize() { + var map, + mapId = document.getElementById("map"); + var latitude = mapId.getAttribute("data-latitude"); + var longitude = mapId.getAttribute("data-longitude"); + var mapMarker = mapId.getAttribute("data-marker"); + var mapMarkerName = mapId.getAttribute("data-marker-name"); + var nottingham = new google.maps.LatLng(latitude, longitude); + var style = [ + { + featureType: "administrative", + elementType: "all", + stylers: [ + { + saturation: "-100", + }, + ], + }, + { + featureType: "administrative.province", + elementType: "all", + stylers: [ + { + visibility: "off", + }, + ], + }, + { + featureType: "landscape", + elementType: "all", + stylers: [ + { + saturation: -100, + }, + { + lightness: 65, + }, + { + visibility: "on", + }, + ], + }, + { + featureType: "poi", + elementType: "all", + stylers: [ + { + saturation: -100, + }, + { + lightness: "50", + }, + { + visibility: "simplified", + }, + ], + }, + { + featureType: "road", + elementType: "all", + stylers: [ + { + saturation: "-100", + }, + ], + }, + { + featureType: "road.highway", + elementType: "all", + stylers: [ + { + visibility: "simplified", + }, + ], + }, + { + featureType: "road.arterial", + elementType: "all", + 
stylers: [ + { + lightness: "30", + }, + ], + }, + { + featureType: "road.local", + elementType: "all", + stylers: [ + { + lightness: "40", + }, + ], + }, + { + featureType: "transit", + elementType: "all", + stylers: [ + { + saturation: -100, + }, + { + visibility: "simplified", + }, + ], + }, + { + featureType: "water", + elementType: "geometry", + stylers: [ + { + hue: "#ffff00", + }, + { + lightness: -25, + }, + { + saturation: -97, + }, + ], + }, + { + featureType: "water", + elementType: "labels", + stylers: [ + { + lightness: -25, + }, + { + saturation: -100, + }, + ], + }, + ]; + var mapOptions = { + center: nottingham, + mapTypeId: google.maps.MapTypeId.ROADMAP, + backgroundColor: "#000", + zoom: 15, + panControl: !1, + zoomControl: !0, + mapTypeControl: !1, + scaleControl: !1, + streetViewControl: !1, + overviewMapControl: !1, + zoomControlOptions: { + style: google.maps.ZoomControlStyle.LARGE, + }, + }; + map = new google.maps.Map(document.getElementById("map"), mapOptions); + var mapType = new google.maps.StyledMapType(style, { + name: "Grayscale", + }); + map.mapTypes.set("grey", mapType); + map.setMapTypeId("grey"); + var marker_image = mapMarker; + var pinIcon = new google.maps.MarkerImage( + marker_image, + null, + null, + null, + new google.maps.Size(30, 50), + ); + marker = new google.maps.Marker({ + position: nottingham, + map: map, + icon: pinIcon, + title: mapMarkerName, + }); +} +var map = document.getElementById("map"); +if (map != null) { + google.maps.event.addDomListener(window, "load", initialize); +} diff --git a/app/themes/hugoplate/assets/plugins/swiper/swiper-bundle.css b/app/themes/hugoplate/assets/plugins/swiper/swiper-bundle.css new file mode 100644 index 0000000..6f0c194 --- /dev/null +++ b/app/themes/hugoplate/assets/plugins/swiper/swiper-bundle.css @@ -0,0 +1,667 @@ +/** + * Swiper 8.0.7 + * Most modern mobile touch slider and framework with hardware accelerated transitions + * https://swiperjs.com + * + * Copyright 2014-2022 
Vladimir Kharlampidi + * + * Released under the MIT License + * + * Released on: March 4, 2022 + */ + +@font-face { + font-family: "swiper-icons"; + src: url("data:application/font-woff;charset=utf-8;base64, d09GRgABAAAAAAZgABAAAAAADAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABGRlRNAAAGRAAAABoAAAAci6qHkUdERUYAAAWgAAAAIwAAACQAYABXR1BPUwAABhQAAAAuAAAANuAY7+xHU1VCAAAFxAAAAFAAAABm2fPczU9TLzIAAAHcAAAASgAAAGBP9V5RY21hcAAAAkQAAACIAAABYt6F0cBjdnQgAAACzAAAAAQAAAAEABEBRGdhc3AAAAWYAAAACAAAAAj//wADZ2x5ZgAAAywAAADMAAAD2MHtryVoZWFkAAABbAAAADAAAAA2E2+eoWhoZWEAAAGcAAAAHwAAACQC9gDzaG10eAAAAigAAAAZAAAArgJkABFsb2NhAAAC0AAAAFoAAABaFQAUGG1heHAAAAG8AAAAHwAAACAAcABAbmFtZQAAA/gAAAE5AAACXvFdBwlwb3N0AAAFNAAAAGIAAACE5s74hXjaY2BkYGAAYpf5Hu/j+W2+MnAzMYDAzaX6QjD6/4//Bxj5GA8AuRwMYGkAPywL13jaY2BkYGA88P8Agx4j+/8fQDYfA1AEBWgDAIB2BOoAeNpjYGRgYNBh4GdgYgABEMnIABJzYNADCQAACWgAsQB42mNgYfzCOIGBlYGB0YcxjYGBwR1Kf2WQZGhhYGBiYGVmgAFGBiQQkOaawtDAoMBQxXjg/wEGPcYDDA4wNUA2CCgwsAAAO4EL6gAAeNpj2M0gyAACqxgGNWBkZ2D4/wMA+xkDdgAAAHjaY2BgYGaAYBkGRgYQiAHyGMF8FgYHIM3DwMHABGQrMOgyWDLEM1T9/w8UBfEMgLzE////P/5//f/V/xv+r4eaAAeMbAxwIUYmIMHEgKYAYjUcsDAwsLKxc3BycfPw8jEQA/gZBASFhEVExcQlJKWkZWTl5BUUlZRVVNXUNTQZBgMAAMR+E+gAEQFEAAAAKgAqACoANAA+AEgAUgBcAGYAcAB6AIQAjgCYAKIArAC2AMAAygDUAN4A6ADyAPwBBgEQARoBJAEuATgBQgFMAVYBYAFqAXQBfgGIAZIBnAGmAbIBzgHsAAB42u2NMQ6CUAyGW568x9AneYYgm4MJbhKFaExIOAVX8ApewSt4Bic4AfeAid3VOBixDxfPYEza5O+Xfi04YADggiUIULCuEJK8VhO4bSvpdnktHI5QCYtdi2sl8ZnXaHlqUrNKzdKcT8cjlq+rwZSvIVczNiezsfnP/uznmfPFBNODM2K7MTQ45YEAZqGP81AmGGcF3iPqOop0r1SPTaTbVkfUe4HXj97wYE+yNwWYxwWu4v1ugWHgo3S1XdZEVqWM7ET0cfnLGxWfkgR42o2PvWrDMBSFj/IHLaF0zKjRgdiVMwScNRAoWUoH78Y2icB/yIY09An6AH2Bdu/UB+yxopYshQiEvnvu0dURgDt8QeC8PDw7Fpji3fEA4z/PEJ6YOB5hKh4dj3EvXhxPqH/SKUY3rJ7srZ4FZnh1PMAtPhwP6fl2PMJMPDgeQ4rY8YT6Gzao0eAEA409DuggmTnFnOcSCiEiLMgxCiTI6Cq5DZUd3Qmp10vO0LaLTd2cjN4fOumlc7lUYbSQcZFkutRG7g6JKZKy0RmdLY680CDnEJ+UMkpFFe1RN7nxdVpXrC4aTtnaurOnYercZg2YVmLN/d/gczfEimrE/fs/bOuq29Zmn8tloORaXgZgGa78yO9/cnXm2BpaGvq25Dv9S4E9+5SIc9PqupJKhYFSSl47+Qcr1mYNAAAAeNptw0cKwkAAAMDZ
JA8Q7OUJvkLsPfZ6zFVERPy8qHh2YER+3i/BP83vIBLLySsoKimrqKqpa2hp6+jq6RsYGhmbmJqZSy0sraxtbO3sHRydnEMU4uR6yx7JJXveP7WrDycAAAAAAAH//wACeNpjYGRgYOABYhkgZgJCZgZNBkYGLQZtIJsFLMYAAAw3ALgAeNolizEKgDAQBCchRbC2sFER0YD6qVQiBCv/H9ezGI6Z5XBAw8CBK/m5iQQVauVbXLnOrMZv2oLdKFa8Pjuru2hJzGabmOSLzNMzvutpB3N42mNgZGBg4GKQYzBhYMxJLMlj4GBgAYow/P/PAJJhLM6sSoWKfWCAAwDAjgbRAAB42mNgYGBkAIIbCZo5IPrmUn0hGA0AO8EFTQAA"); + font-weight: 400; + font-style: normal; +} +:root { + --swiper-theme-color: #007aff; +} +.swiper { + margin-left: auto; + margin-right: auto; + position: relative; + overflow: hidden; + list-style: none; + padding: 0; + /* Fix of Webkit flickering */ + z-index: 1; +} +.swiper-vertical > .swiper-wrapper { + flex-direction: column; +} +.swiper-wrapper { + position: relative; + width: 100%; + height: 100%; + z-index: 1; + display: flex; + transition-property: transform; + box-sizing: content-box; +} +.swiper-android .swiper-slide, +.swiper-wrapper { + transform: translate3d(0px, 0, 0); +} +.swiper-pointer-events { + touch-action: pan-y; +} +.swiper-pointer-events.swiper-vertical { + touch-action: pan-x; +} +.swiper-slide { + flex-shrink: 0; + width: 100%; + height: 100%; + position: relative; + transition-property: transform; +} +.swiper-slide-invisible-blank { + visibility: hidden; +} +/* Auto Height */ +.swiper-autoheight, +.swiper-autoheight .swiper-slide { + height: auto; +} +.swiper-autoheight .swiper-wrapper { + align-items: flex-start; + transition-property: transform, height; +} +.swiper-backface-hidden .swiper-slide { + transform: translateZ(0); + -webkit-backface-visibility: hidden; + backface-visibility: hidden; +} +/* 3D Effects */ +.swiper-3d, +.swiper-3d.swiper-css-mode .swiper-wrapper { + perspective: 1200px; +} +.swiper-3d .swiper-wrapper, +.swiper-3d .swiper-slide, +.swiper-3d .swiper-slide-shadow, +.swiper-3d .swiper-slide-shadow-left, +.swiper-3d .swiper-slide-shadow-right, +.swiper-3d .swiper-slide-shadow-top, +.swiper-3d .swiper-slide-shadow-bottom, +.swiper-3d 
.swiper-cube-shadow { + transform-style: preserve-3d; +} +.swiper-3d .swiper-slide-shadow, +.swiper-3d .swiper-slide-shadow-left, +.swiper-3d .swiper-slide-shadow-right, +.swiper-3d .swiper-slide-shadow-top, +.swiper-3d .swiper-slide-shadow-bottom { + position: absolute; + left: 0; + top: 0; + width: 100%; + height: 100%; + pointer-events: none; + z-index: 10; +} +.swiper-3d .swiper-slide-shadow { + background: rgba(0, 0, 0, 0.15); +} +.swiper-3d .swiper-slide-shadow-left { + background-image: linear-gradient( + to left, + rgba(0, 0, 0, 0.5), + rgba(0, 0, 0, 0) + ); +} +.swiper-3d .swiper-slide-shadow-right { + background-image: linear-gradient( + to right, + rgba(0, 0, 0, 0.5), + rgba(0, 0, 0, 0) + ); +} +.swiper-3d .swiper-slide-shadow-top { + background-image: linear-gradient( + to top, + rgba(0, 0, 0, 0.5), + rgba(0, 0, 0, 0) + ); +} +.swiper-3d .swiper-slide-shadow-bottom { + background-image: linear-gradient( + to bottom, + rgba(0, 0, 0, 0.5), + rgba(0, 0, 0, 0) + ); +} +/* CSS Mode */ +.swiper-css-mode > .swiper-wrapper { + overflow: auto; + scrollbar-width: none; + /* For Firefox */ + -ms-overflow-style: none; + /* For Internet Explorer and Edge */ +} +.swiper-css-mode > .swiper-wrapper::-webkit-scrollbar { + display: none; +} +.swiper-css-mode > .swiper-wrapper > .swiper-slide { + scroll-snap-align: start start; +} +.swiper-horizontal.swiper-css-mode > .swiper-wrapper { + scroll-snap-type: x mandatory; +} +.swiper-vertical.swiper-css-mode > .swiper-wrapper { + scroll-snap-type: y mandatory; +} +.swiper-centered > .swiper-wrapper::before { + content: ""; + flex-shrink: 0; + order: 9999; +} +.swiper-centered.swiper-horizontal + > .swiper-wrapper + > .swiper-slide:first-child { + margin-inline-start: var(--swiper-centered-offset-before); +} +.swiper-centered.swiper-horizontal > .swiper-wrapper::before { + height: 100%; + min-height: 1px; + width: var(--swiper-centered-offset-after); +} +.swiper-centered.swiper-vertical > .swiper-wrapper > 
.swiper-slide:first-child { + margin-block-start: var(--swiper-centered-offset-before); +} +.swiper-centered.swiper-vertical > .swiper-wrapper::before { + width: 100%; + min-width: 1px; + height: var(--swiper-centered-offset-after); +} +.swiper-centered > .swiper-wrapper > .swiper-slide { + scroll-snap-align: center center; +} +.swiper-virtual .swiper-slide { + -webkit-backface-visibility: hidden; + transform: translateZ(0); +} +.swiper-virtual.swiper-css-mode .swiper-wrapper::after { + content: ""; + position: absolute; + left: 0; + top: 0; + pointer-events: none; +} +.swiper-virtual.swiper-css-mode.swiper-horizontal .swiper-wrapper::after { + height: 1px; + width: var(--swiper-virtual-size); +} +.swiper-virtual.swiper-css-mode.swiper-vertical .swiper-wrapper::after { + width: 1px; + height: var(--swiper-virtual-size); +} +:root { + --swiper-navigation-size: 44px; + /* + --swiper-navigation-color: var(--swiper-theme-color); + */ +} +.swiper-button-prev, +.swiper-button-next { + position: absolute; + top: 50%; + width: calc(var(--swiper-navigation-size) / 44 * 27); + height: var(--swiper-navigation-size); + margin-top: calc(0px - (var(--swiper-navigation-size) / 2)); + z-index: 10; + cursor: pointer; + display: flex; + align-items: center; + justify-content: center; + color: var(--swiper-navigation-color, var(--swiper-theme-color)); +} +.swiper-button-prev.swiper-button-disabled, +.swiper-button-next.swiper-button-disabled { + opacity: 0.35; + cursor: auto; + pointer-events: none; +} +.swiper-button-prev:after, +.swiper-button-next:after { + font-family: swiper-icons; + font-size: var(--swiper-navigation-size); + text-transform: none !important; + letter-spacing: 0; + text-transform: none; + font-variant: initial; + line-height: 1; +} +.swiper-button-prev, +.swiper-rtl .swiper-button-next { + left: 10px; + right: auto; +} +.swiper-button-prev:after, +.swiper-rtl .swiper-button-next:after { + content: "prev"; +} +.swiper-button-next, +.swiper-rtl .swiper-button-prev 
{ + right: 10px; + left: auto; +} +.swiper-button-next:after, +.swiper-rtl .swiper-button-prev:after { + content: "next"; +} +.swiper-button-lock { + display: none; +} +:root { + /* + --swiper-pagination-color: var(--swiper-theme-color); + --swiper-pagination-bullet-size: 8px; + --swiper-pagination-bullet-width: 8px; + --swiper-pagination-bullet-height: 8px; + --swiper-pagination-bullet-inactive-color: #000; + --swiper-pagination-bullet-inactive-opacity: 0.2; + --swiper-pagination-bullet-opacity: 1; + --swiper-pagination-bullet-horizontal-gap: 4px; + --swiper-pagination-bullet-vertical-gap: 6px; + */ +} +.swiper-pagination { + position: absolute; + text-align: center; + transition: 300ms opacity; + transform: translate3d(0, 0, 0); + z-index: 10; +} +.swiper-pagination.swiper-pagination-hidden { + opacity: 0; +} +/* Common Styles */ +.swiper-pagination-fraction, +.swiper-pagination-custom, +.swiper-horizontal > .swiper-pagination-bullets, +.swiper-pagination-bullets.swiper-pagination-horizontal { + bottom: 10px; + left: 0; + width: 100%; +} +/* Bullets */ +.swiper-pagination-bullets-dynamic { + overflow: hidden; + font-size: 0; +} +.swiper-pagination-bullets-dynamic .swiper-pagination-bullet { + transform: scale(0.33); + position: relative; +} +.swiper-pagination-bullets-dynamic .swiper-pagination-bullet-active { + transform: scale(1); +} +.swiper-pagination-bullets-dynamic .swiper-pagination-bullet-active-main { + transform: scale(1); +} +.swiper-pagination-bullets-dynamic .swiper-pagination-bullet-active-prev { + transform: scale(0.66); +} +.swiper-pagination-bullets-dynamic .swiper-pagination-bullet-active-prev-prev { + transform: scale(0.33); +} +.swiper-pagination-bullets-dynamic .swiper-pagination-bullet-active-next { + transform: scale(0.66); +} +.swiper-pagination-bullets-dynamic .swiper-pagination-bullet-active-next-next { + transform: scale(0.33); +} +.swiper-pagination-bullet { + width: var( + --swiper-pagination-bullet-width, + 
var(--swiper-pagination-bullet-size, 8px) + ); + height: var( + --swiper-pagination-bullet-height, + var(--swiper-pagination-bullet-size, 8px) + ); + display: inline-block; + border-radius: 50%; + background: var(--swiper-pagination-bullet-inactive-color, #000); + opacity: var(--swiper-pagination-bullet-inactive-opacity, 0.2); +} +button.swiper-pagination-bullet { + border: none; + margin: 0; + padding: 0; + box-shadow: none; + -webkit-appearance: none; + appearance: none; +} +.swiper-pagination-clickable .swiper-pagination-bullet { + cursor: pointer; +} +.swiper-pagination-bullet:only-child { + display: none !important; +} +.swiper-pagination-bullet-active { + opacity: var(--swiper-pagination-bullet-opacity, 1); + background: var(--swiper-pagination-color, var(--swiper-theme-color)); +} +.swiper-vertical > .swiper-pagination-bullets, +.swiper-pagination-vertical.swiper-pagination-bullets { + right: 10px; + top: 50%; + transform: translate3d(0px, -50%, 0); +} +.swiper-vertical > .swiper-pagination-bullets .swiper-pagination-bullet, +.swiper-pagination-vertical.swiper-pagination-bullets + .swiper-pagination-bullet { + margin: var(--swiper-pagination-bullet-vertical-gap, 6px) 0; + display: block; +} +.swiper-vertical > .swiper-pagination-bullets.swiper-pagination-bullets-dynamic, +.swiper-pagination-vertical.swiper-pagination-bullets.swiper-pagination-bullets-dynamic { + top: 50%; + transform: translateY(-50%); + width: 8px; +} +.swiper-vertical + > .swiper-pagination-bullets.swiper-pagination-bullets-dynamic + .swiper-pagination-bullet, +.swiper-pagination-vertical.swiper-pagination-bullets.swiper-pagination-bullets-dynamic + .swiper-pagination-bullet { + display: inline-block; + transition: + 200ms transform, + 200ms top; +} +.swiper-horizontal > .swiper-pagination-bullets .swiper-pagination-bullet, +.swiper-pagination-horizontal.swiper-pagination-bullets + .swiper-pagination-bullet { + margin: 0 var(--swiper-pagination-bullet-horizontal-gap, 4px); +} 
+.swiper-horizontal + > .swiper-pagination-bullets.swiper-pagination-bullets-dynamic, +.swiper-pagination-horizontal.swiper-pagination-bullets.swiper-pagination-bullets-dynamic { + left: 50%; + transform: translateX(-50%); + white-space: nowrap; +} +.swiper-horizontal + > .swiper-pagination-bullets.swiper-pagination-bullets-dynamic + .swiper-pagination-bullet, +.swiper-pagination-horizontal.swiper-pagination-bullets.swiper-pagination-bullets-dynamic + .swiper-pagination-bullet { + transition: + 200ms transform, + 200ms left; +} +.swiper-horizontal.swiper-rtl + > .swiper-pagination-bullets-dynamic + .swiper-pagination-bullet { + transition: + 200ms transform, + 200ms right; +} +/* Progress */ +.swiper-pagination-progressbar { + background: rgba(0, 0, 0, 0.25); + position: absolute; +} +.swiper-pagination-progressbar .swiper-pagination-progressbar-fill { + background: var(--swiper-pagination-color, var(--swiper-theme-color)); + position: absolute; + left: 0; + top: 0; + width: 100%; + height: 100%; + transform: scale(0); + transform-origin: left top; +} +.swiper-rtl .swiper-pagination-progressbar .swiper-pagination-progressbar-fill { + transform-origin: right top; +} +.swiper-horizontal > .swiper-pagination-progressbar, +.swiper-pagination-progressbar.swiper-pagination-horizontal, +.swiper-vertical + > .swiper-pagination-progressbar.swiper-pagination-progressbar-opposite, +.swiper-pagination-progressbar.swiper-pagination-vertical.swiper-pagination-progressbar-opposite { + width: 100%; + height: 4px; + left: 0; + top: 0; +} +.swiper-vertical > .swiper-pagination-progressbar, +.swiper-pagination-progressbar.swiper-pagination-vertical, +.swiper-horizontal + > .swiper-pagination-progressbar.swiper-pagination-progressbar-opposite, +.swiper-pagination-progressbar.swiper-pagination-horizontal.swiper-pagination-progressbar-opposite { + width: 4px; + height: 100%; + left: 0; + top: 0; +} +.swiper-pagination-lock { + display: none; +} +/* Scrollbar */ +.swiper-scrollbar { + 
border-radius: 10px; + position: relative; + -ms-touch-action: none; + background: rgba(0, 0, 0, 0.1); +} +.swiper-horizontal > .swiper-scrollbar { + position: absolute; + left: 1%; + bottom: 3px; + z-index: 50; + height: 5px; + width: 98%; +} +.swiper-vertical > .swiper-scrollbar { + position: absolute; + right: 3px; + top: 1%; + z-index: 50; + width: 5px; + height: 98%; +} +.swiper-scrollbar-drag { + height: 100%; + width: 100%; + position: relative; + background: rgba(0, 0, 0, 0.5); + border-radius: 10px; + left: 0; + top: 0; +} +.swiper-scrollbar-cursor-drag { + cursor: move; +} +.swiper-scrollbar-lock { + display: none; +} +.swiper-zoom-container { + width: 100%; + height: 100%; + display: flex; + justify-content: center; + align-items: center; + text-align: center; +} +.swiper-zoom-container > img, +.swiper-zoom-container > svg, +.swiper-zoom-container > canvas { + max-width: 100%; + max-height: 100%; + object-fit: contain; +} +.swiper-slide-zoomed { + cursor: move; +} +/* Preloader */ +:root { + /* + --swiper-preloader-color: var(--swiper-theme-color); + */ +} +.swiper-lazy-preloader { + width: 42px; + height: 42px; + position: absolute; + left: 50%; + top: 50%; + margin-left: -21px; + margin-top: -21px; + z-index: 10; + transform-origin: 50%; + box-sizing: border-box; + border: 4px solid var(--swiper-preloader-color, var(--swiper-theme-color)); + border-radius: 50%; + border-top-color: transparent; +} +.swiper-slide-visible .swiper-lazy-preloader { + animation: swiper-preloader-spin 1s infinite linear; +} +.swiper-lazy-preloader-white { + --swiper-preloader-color: #fff; +} +.swiper-lazy-preloader-black { + --swiper-preloader-color: #000; +} +@keyframes swiper-preloader-spin { + 100% { + transform: rotate(360deg); + } +} +/* a11y */ +.swiper .swiper-notification { + position: absolute; + left: 0; + top: 0; + pointer-events: none; + opacity: 0; + z-index: -1000; +} +.swiper-free-mode > .swiper-wrapper { + transition-timing-function: ease-out; + margin: 0 
auto; +} +.swiper-grid > .swiper-wrapper { + flex-wrap: wrap; +} +.swiper-grid-column > .swiper-wrapper { + flex-wrap: wrap; + flex-direction: column; +} +.swiper-fade.swiper-free-mode .swiper-slide { + transition-timing-function: ease-out; +} +.swiper-fade .swiper-slide { + pointer-events: none; + transition-property: opacity; +} +.swiper-fade .swiper-slide .swiper-slide { + pointer-events: none; +} +.swiper-fade .swiper-slide-active, +.swiper-fade .swiper-slide-active .swiper-slide-active { + pointer-events: auto; +} +.swiper-cube { + overflow: visible; +} +.swiper-cube .swiper-slide { + pointer-events: none; + -webkit-backface-visibility: hidden; + backface-visibility: hidden; + z-index: 1; + visibility: hidden; + transform-origin: 0 0; + width: 100%; + height: 100%; +} +.swiper-cube .swiper-slide .swiper-slide { + pointer-events: none; +} +.swiper-cube.swiper-rtl .swiper-slide { + transform-origin: 100% 0; +} +.swiper-cube .swiper-slide-active, +.swiper-cube .swiper-slide-active .swiper-slide-active { + pointer-events: auto; +} +.swiper-cube .swiper-slide-active, +.swiper-cube .swiper-slide-next, +.swiper-cube .swiper-slide-prev, +.swiper-cube .swiper-slide-next + .swiper-slide { + pointer-events: auto; + visibility: visible; +} +.swiper-cube .swiper-slide-shadow-top, +.swiper-cube .swiper-slide-shadow-bottom, +.swiper-cube .swiper-slide-shadow-left, +.swiper-cube .swiper-slide-shadow-right { + z-index: 0; + -webkit-backface-visibility: hidden; + backface-visibility: hidden; +} +.swiper-cube .swiper-cube-shadow { + position: absolute; + left: 0; + bottom: 0px; + width: 100%; + height: 100%; + opacity: 0.6; + z-index: 0; +} +.swiper-cube .swiper-cube-shadow:before { + content: ""; + background: #000; + position: absolute; + left: 0; + top: 0; + bottom: 0; + right: 0; + filter: blur(50px); +} +.swiper-flip { + overflow: visible; +} +.swiper-flip .swiper-slide { + pointer-events: none; + -webkit-backface-visibility: hidden; + backface-visibility: hidden; + 
z-index: 1; +} +.swiper-flip .swiper-slide .swiper-slide { + pointer-events: none; +} +.swiper-flip .swiper-slide-active, +.swiper-flip .swiper-slide-active .swiper-slide-active { + pointer-events: auto; +} +.swiper-flip .swiper-slide-shadow-top, +.swiper-flip .swiper-slide-shadow-bottom, +.swiper-flip .swiper-slide-shadow-left, +.swiper-flip .swiper-slide-shadow-right { + z-index: 0; + -webkit-backface-visibility: hidden; + backface-visibility: hidden; +} +.swiper-creative .swiper-slide { + -webkit-backface-visibility: hidden; + backface-visibility: hidden; + overflow: hidden; + transition-property: transform, opacity, height; +} +.swiper-cards { + overflow: visible; +} +.swiper-cards .swiper-slide { + transform-origin: center bottom; + -webkit-backface-visibility: hidden; + backface-visibility: hidden; + overflow: hidden; +} diff --git a/app/themes/hugoplate/assets/plugins/swiper/swiper-bundle.js b/app/themes/hugoplate/assets/plugins/swiper/swiper-bundle.js new file mode 100644 index 0000000..9c90ea5 --- /dev/null +++ b/app/themes/hugoplate/assets/plugins/swiper/swiper-bundle.js @@ -0,0 +1,11853 @@ +/** + * Swiper 8.0.7 + * Most modern mobile touch slider and framework with hardware accelerated transitions + * https://swiperjs.com + * + * Copyright 2014-2022 Vladimir Kharlampidi + * + * Released under the MIT License + * + * Released on: March 4, 2022 + */ + +(function (global, factory) { + typeof exports === "object" && typeof module !== "undefined" + ? (module.exports = factory()) + : typeof define === "function" && define.amd + ? define(factory) + : ((global = + typeof globalThis !== "undefined" ? 
globalThis : global || self), + (global.Swiper = factory())); +})(this, function () { + "use strict"; + + /** + * SSR Window 4.0.2 + * Better handling for window object in SSR environment + * https://github.com/nolimits4web/ssr-window + * + * Copyright 2021, Vladimir Kharlampidi + * + * Licensed under MIT + * + * Released on: December 13, 2021 + */ + + /* eslint-disable no-param-reassign */ + function isObject$1(obj) { + return ( + obj !== null && + typeof obj === "object" && + "constructor" in obj && + obj.constructor === Object + ); + } + + function extend$1(target, src) { + if (target === void 0) { + target = {}; + } + + if (src === void 0) { + src = {}; + } + + Object.keys(src).forEach((key) => { + if (typeof target[key] === "undefined") target[key] = src[key]; + else if ( + isObject$1(src[key]) && + isObject$1(target[key]) && + Object.keys(src[key]).length > 0 + ) { + extend$1(target[key], src[key]); + } + }); + } + + const ssrDocument = { + body: {}, + + addEventListener() {}, + + removeEventListener() {}, + + activeElement: { + blur() {}, + + nodeName: "", + }, + + querySelector() { + return null; + }, + + querySelectorAll() { + return []; + }, + + getElementById() { + return null; + }, + + createEvent() { + return { + initEvent() {}, + }; + }, + + createElement() { + return { + children: [], + childNodes: [], + style: {}, + + setAttribute() {}, + + getElementsByTagName() { + return []; + }, + }; + }, + + createElementNS() { + return {}; + }, + + importNode() { + return null; + }, + + location: { + hash: "", + host: "", + hostname: "", + href: "", + origin: "", + pathname: "", + protocol: "", + search: "", + }, + }; + + function getDocument() { + const doc = typeof document !== "undefined" ? 
document : {}; + extend$1(doc, ssrDocument); + return doc; + } + + const ssrWindow = { + document: ssrDocument, + navigator: { + userAgent: "", + }, + location: { + hash: "", + host: "", + hostname: "", + href: "", + origin: "", + pathname: "", + protocol: "", + search: "", + }, + history: { + replaceState() {}, + + pushState() {}, + + go() {}, + + back() {}, + }, + CustomEvent: function CustomEvent() { + return this; + }, + + addEventListener() {}, + + removeEventListener() {}, + + getComputedStyle() { + return { + getPropertyValue() { + return ""; + }, + }; + }, + + Image() {}, + + Date() {}, + + screen: {}, + + setTimeout() {}, + + clearTimeout() {}, + + matchMedia() { + return {}; + }, + + requestAnimationFrame(callback) { + if (typeof setTimeout === "undefined") { + callback(); + return null; + } + + return setTimeout(callback, 0); + }, + + cancelAnimationFrame(id) { + if (typeof setTimeout === "undefined") { + return; + } + + clearTimeout(id); + }, + }; + + function getWindow() { + const win = typeof window !== "undefined" ? 
window : {}; + extend$1(win, ssrWindow); + return win; + } + + /** + * Dom7 4.0.4 + * Minimalistic JavaScript library for DOM manipulation, with a jQuery-compatible API + * https://framework7.io/docs/dom7.html + * + * Copyright 2022, Vladimir Kharlampidi + * + * Licensed under MIT + * + * Released on: January 11, 2022 + */ + /* eslint-disable no-proto */ + + function makeReactive(obj) { + const proto = obj.__proto__; + Object.defineProperty(obj, "__proto__", { + get() { + return proto; + }, + + set(value) { + proto.__proto__ = value; + }, + }); + } + + class Dom7 extends Array { + constructor(items) { + if (typeof items === "number") { + super(items); + } else { + super(...(items || [])); + makeReactive(this); + } + } + } + + function arrayFlat(arr) { + if (arr === void 0) { + arr = []; + } + + const res = []; + arr.forEach((el) => { + if (Array.isArray(el)) { + res.push(...arrayFlat(el)); + } else { + res.push(el); + } + }); + return res; + } + + function arrayFilter(arr, callback) { + return Array.prototype.filter.call(arr, callback); + } + + function arrayUnique(arr) { + const uniqueArray = []; + + for (let i = 0; i < arr.length; i += 1) { + if (uniqueArray.indexOf(arr[i]) === -1) uniqueArray.push(arr[i]); + } + + return uniqueArray; + } + + function qsa(selector, context) { + if (typeof selector !== "string") { + return [selector]; + } + + const a = []; + const res = context.querySelectorAll(selector); + + for (let i = 0; i < res.length; i += 1) { + a.push(res[i]); + } + + return a; + } + + function $(selector, context) { + const window = getWindow(); + const document = getDocument(); + let arr = []; + + if (!context && selector instanceof Dom7) { + return selector; + } + + if (!selector) { + return new Dom7(arr); + } + + if (typeof selector === "string") { + const html = selector.trim(); + + if (html.indexOf("<") >= 0 && html.indexOf(">") >= 0) { + let toCreate = "div"; + if (html.indexOf("+ The page you are looking for might have been removed, had its name + 
changed, or is temporarily unavailable. +
+{{ $entry.description | markdownify }}
+ + {{ with $entry.license }}License: {{ . }}
{{ end }} + + + +{{ $faq.acceptedAnswer.text | markdownify }}
+{{ .Params.Details }}
++ This cheatsheet aims to make it easier for new community members to understand our resources, tools, and + findings. We emphasize good practices, understanding limitations, and responsible use of these resources.
+Note
+These are the different parts of the AI development process, and we have a corresponding cheatsheet + section on each.
+Assembled by open model developers from AI2, EleutherAI, Google, + Hugging Face, Masakhane, MIT, MLCommons, Princeton, Stanford CRFM, University of California Santa Barbara + (UCSB), University College London (UCL) and University of Washington (UW).
+ +A foundation model is a large AI model, such as a large language model (LLM), that has + been trained on a massive amount of data. These models have a wide range of capabilities, from generating + realistic text to translating languages and creating different forms of creative content.
+Building a foundation model is a complex task. The cheatsheet has sections on "Data + Search, Analysis, & Exploration" tools to help you find and understand training data, as well as "Model + Training" sections on repositories and efficiency techniques.
+The quality and diversity of your data greatly impact your model. "Pretraining Data + Sources" and "Finetuning Data Catalogs" sections of the cheatsheet explain factors to consider like + language coverage, data source mixes, and the importance of clear documentation.
+Yes, the computational power needed for training foundation models has a significant + carbon footprint. The cheatsheet includes an "Environmental Impact" section with tools and methodologies + to estimate and responsibly manage this impact.
+The cheatsheet covers responsible AI practices extensively. Key points include: + thorough model documentation ("Model Documentation" section), evaluating risks and potential harms ("Model + Evaluation" sections), careful license selection for your model ("License Selection" section), and + potentially monitoring model usage ("Usage Monitoring" section).
+The cheatsheet is actively maintained and encourages community contributions. It's + designed to be a starting point, with many sections linking to in-depth resources and repositories.
++ {{ .Summary }} +
+ +{{ .Summary }}
+{{ .Description }}
+{{ .Description }}
+Click on the corresponding development section to view foundation model resources.
+{{ .Params.short_name }}
+ diff --git a/app/themes/hugoplate/layouts/partials/components/section-details.html b/app/themes/hugoplate/layouts/partials/components/section-details.html new file mode 100644 index 0000000..f62e8c0 --- /dev/null +++ b/app/themes/hugoplate/layouts/partials/components/section-details.html @@ -0,0 +1,20 @@ +{{/* partials/section-list.html */}} +{{ $sections := getJSON "assets/sections.json" }} + +{{ range $index, $section := $sections.sections }} +{{ . }}
{{ $entry.description | markdownify }}
+ + {{ with $entry.license }} +License: {{ . }}
+ {{ end }} + +