Commit
style: define experimental guidelines (#168)
aarnphm authored Jul 31, 2023
1 parent 2c2070f commit 8c2867d
Showing 128 changed files with 8,762 additions and 9,920 deletions.
6 changes: 6 additions & 0 deletions .pre-commit-config.yaml
@@ -32,6 +32,12 @@ repos:
types: [python]
exclude: ^(docs|tools|tests)
args: [--config=pyproject.toml]
- repo: https://github.com/google/yapf
rev: v0.40.1
hooks:
- id: yapf
types: [python]
args: [--parallel, --recursive]
- repo: local
hooks:
- id: mypy
9 changes: 7 additions & 2 deletions DEVELOPMENT.md
@@ -155,8 +155,13 @@ hatch run tests:snapshot-models

## Working with Git

To filter out most of the generated infrastructure commits, use
`--invert-grep` in conjunction with `--grep` to exclude all commits matching
the regex `"[generated]"`
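As a sketch of that invocation in a throwaway repository (the commit messages below are made up for illustration; note the brackets are escaped so `--grep` matches them literally rather than as a character class):

```shell
# Demo repo with one real commit and one generated infrastructure commit.
tmp=$(mktemp -d) && cd "$tmp" && git init -q
git -c user.email=dev@example.com -c user.name=dev commit -q --allow-empty -m "feat: add feature"
git -c user.email=dev@example.com -c user.name=dev commit -q --allow-empty -m "chore: sync [generated]"

# --invert-grep hides every commit whose message matches the pattern.
git log --invert-grep --grep='\[generated\]' --format=%s
```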

## Style

See [STYLE.md](STYLE.md) for our style guide.

## Releasing a New Version

2 changes: 2 additions & 0 deletions README.md
@@ -19,6 +19,8 @@
<img src="https://img.shields.io/pypi/pyversions/openllm.svg?logo=python&label=Python&logoColor=gold" alt="python_version" />
</a><a href="https://github.com/pypa/hatch">
<img src="https://img.shields.io/badge/%F0%9F%A5%9A-Hatch-4051b5.svg" alt="Hatch" />
</a><a href="https://github.com/bentoml/OpenLLM/blob/main/STYLE.md">
<img src="https://img.shields.io/badge/code%20style-experimental-000000.svg" alt="code style" />
</a><a href="https://github.com/astral-sh/ruff">
<img src="https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/charliermarsh/ruff/main/assets/badge/v2.json" alt="Ruff" />
</a><a href="https://github.com/python/mypy">
160 changes: 160 additions & 0 deletions STYLE.md
@@ -0,0 +1,160 @@
## the coding style

This document serves as a brief discussion of the coding style used for
OpenLLM. As you may have noticed, it differs from the conventional
[PEP8](https://peps.python.org/pep-0008/) style used across many Python
projects. The OpenLLM code style is a combination of
[Google Python Style](https://google.github.io/styleguide/pyguide.html) and
inspiration from languages such as APL and Haskell, and is designed for fast,
experimental development and prototyping.

Everyone has their own opinions on style. I believe this is exemplified
within the Python community, which tries to be beginner-friendly, and
therefore most people hold very strong opinions on styling. I don't have a
strong opinion on style either (I have no issue with PEP8, which we use for
our other projects), as long as:

- You don't use any linter or formatter that changes the style drastically
  beyond what is specified in the project's [`pyproject.toml`](./pyproject.toml).
- The code you contribute is not wildly different from the style of the code
  surrounding it.

With that being said, I want to use this project as a playground to explore a
style that both feels natural and is expressive for mathematical reasoning. I
hope you find this guide somewhat thought-provoking and interesting, and that
you can iterate on and adopt some of it as part of the process of
contributing to the library.

While PEP8 is a great base for a style guide, I find that it introduces too
much whitespace and makes the code feel 'robotic'. A deterministic style and
formatter is great for reducing the overhead of stylistic discussions, but I
think it is important to write code that expresses the intent of the
reasoning. (_The policy here is definitely not "shovel everything into one
line", but rather "compact and flowing"._)

The styling is heavily inspired by
[Kenneth Iverson's](https://en.wikipedia.org/wiki/Kenneth_E._Iverson) 1979
Turing Award lecture,
[Notation as a Tool of Thought](https://www.eecg.toronto.edu/~jzhu/csc326/readings/iverson.pdf),
and a lot of the stylistic inspiration comes from
[Jeremy Howard's](https://jeremy.fast.ai/) [fastai](https://docs.fast.ai/). One
idea that has stuck with me ever since is that "brevity facilitates
reasoning": the terseness of the style isn't shortness for its own sake, but
brevity of expression. (It enables
[expository programming](http://archive.vector.org.uk/art10000980), combined
with prototyping new ideas and logic within model implementations.)

## some guidelines

Though I have stopped using a deterministic formatter and linter, I do
understand that people have preferences for these tools, and they play nicely
with IDEs and editors. As such, I included a
[`pyproject.toml`](./pyproject.toml) file that specifies some configuration to
make the tools compliant with the repository's style. In short, the tools
include `ruff`, `yapf`, and `interrogate`. Since we manage everything via
`hatch`, refer to [DEVELOPMENT.md](./DEVELOPMENT.md) for more information.

Over time, Python has incorporated many features that support this style of
coding, including list comprehensions, generator expressions, lambdas, and
array-based programming. Yet Python will remain verbose per se, so the goal is
to make code fit nicely on a screen without constantly scrolling downwards.
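For instance, comprehensions and generator expressions keep an entire transform on a single line (a generic illustration, not code from the library):

```python
# Filter + map in one comprehension; a whole reduction in one genexp.
squares = [x*x for x in range(6) if x % 2 == 0]
total = sum(x*x for x in range(6))
print(squares, total)
```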

While brevity is important, it is also important to keep functions somewhat
type-safe. Since there is no real type safety when working with Python,
typing should be a best effort to make sure we don't introduce too many
bugs.

### naming

- Follow the Python standard here; I don't have much of an opinion on this.
  Just make sure the name is descriptive and any abbreviation describes the
  intent of the variable, i.e. `to_gpu` instead of `t_gpu`, `to_cpu` instead
  of `t_cpu`.
- Any math-related notation or neural-net layers should be expressive and stay
  as close to the paper as possible. For example, `lm_head.weight` instead
  of `lm_head.w`. Especially when implementing custom kernels and layers, it
  is crucial to follow the paper's nomenclature, e.g. `conv1` instead of
  `first_conv_layer`.

_If you have any suggestions, feel free to share them on our Discord server!_

### layout

- Preferably not a lot of whitespace; the code should flow. If you can fit
  an `if`, `def`, or `return` on one line, there is no need to break it into
  multiple lines:

```python
def foo(x): return rotate_cv(x) if x > 0 else -x
```

- Imports should be grouped by type, and each import should go on its own
  line.

```python
import os
import sys
```
  This is partially to make merge conflicts easier to resolve, and to make it
  easier for IDEs to navigate to definitions.

- Indent with 2 spaces, following the Google style.

- When writing operators, try to follow the domain-specific notation. E.g.
  when using pathlib, don't add spaces around `/`, since that is not how you
  write a path in the terminal. `yapf` will try to accommodate some of these
  conventions.
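For example, with pathlib the join reads like a path in a shell when the `/` operator carries no spaces (the path components here are hypothetical):

```python
from pathlib import Path

# No spaces around '/', mirroring how the path would be typed in a terminal.
config = Path('~')/'.config'/'openllm'/'config.yaml'
print(config)
```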

- Avoid trailing whitespace

- Use array-, PyTorch-, or NumPy-based indexing where possible.
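  A small sketch of what array-based indexing means in practice, assuming `numpy` is available:

```python
import numpy as np

xs = np.arange(10)
evens = xs[xs % 2 == 0]   # boolean mask instead of an explicit filter loop
firsts = xs[:3]           # slice instead of an index loop
```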

- If you need to export anything, put it in `__all__` or do a lazy export for
  the type checker.

### misc

- Import aliases should be concise and descriptive. A convention is to always
  `import typing as t`.
- Write docstrings where possible. There is no need to comment everything, as
  that makes the codebase hard to read. For docstrings, follow the Google
  style guide.
- We do lazy imports, so consult some of the `__init__.py` files to see how we
  do it.
- Documentation is still _work-in-progress_, but tl;dr it will be written in
  MDX and hosted on GitHub Pages, so stay tuned!
- Anything not used at runtime should go under `t.TYPE_CHECKING`.
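  A sketch of the `t.TYPE_CHECKING` guard: the heavy import exists only for the type checker and is never executed at runtime (the function here is made up for illustration):

```python
import typing as t

if t.TYPE_CHECKING:
  import numpy as np  # typing-only import; never imported at runtime

def first_row(arr: 'np.ndarray') -> t.Any:
  # The annotation is a string, so numpy is not required to run this.
  return arr[0]
```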

### note on codegen

- We also generate code for some of the assignment functions. This logic is
  largely based on the work of [attrs](https://github.com/python-attrs/attrs)
  to ensure fast and isolated codegen in Python. If you need codegen but don't
  know how it works, feel free to mention @aarnphm_ on Discord!
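  To give a flavour of the idea — a toy sketch of the attrs-style approach, not the actual OpenLLM codegen — build the function's source as text, compile it once into an isolated namespace, and pull the function out:

```python
def make_init(fields):
  # Generate `def __init__(self, x, y):\n  self.x = x\n  self.y = y` as text.
  args = ', '.join(fields)
  body = '\n'.join(f'  self.{f} = {f}' for f in fields)
  src = f'def __init__(self, {args}):\n{body}\n'
  ns = {}
  # Compile once into an isolated namespace, then retrieve the function.
  exec(compile(src, '<codegen>', 'exec'), {}, ns)
  return ns['__init__']

class Point:
  __init__ = make_init(['x', 'y'])
```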

## FAQ

### Why not use `black`?

`black` is used on our other projects, but I find `black` to be very verbose,
and over time the extra whitespace becomes annoying to work with.

### Why not PEP8?

PEP8 is great if you are writing a library such as this one, but I'm going to
do a lot of experimenting implementing papers, so I decided early on that
PEP8 probably isn't a fit here, and I want to explore a more expressive style.

### Editor is complaining about the style, what should I do?

I kindly ask you to disable linting for this project 🤗. I will try my best to
accommodate ruff and yapf, but I don't want to spend too much time on this.
It is pretty straightforward to disable in your editor; a quick search will
show you how.

### Won't the style put off new contributors?

I don't think so. As mentioned before, I don't have much of an opinion on
style as long as it roughly follows what I have described above or the style
of the surrounding code. I will still accept style PRs as long as they are
not too drastic. Just make sure to add the revision to
`.git-blame-ignore-revs` so that `git blame` works correctly.

As for people who are too close-minded about styling, such individuals aren't
the ones we want to work with anyway!
3 changes: 3 additions & 0 deletions changelog.d/168.chore.md
@@ -0,0 +1,3 @@
Define specific style guideline for the project. See
[STYLE.md](https://github.com/bentoml/OpenLLM/blob/main/STYLE.md) for more
information.
8 changes: 3 additions & 5 deletions examples/bentoml-demo/service.py
@@ -24,13 +24,11 @@

svc = bentoml.Service(name="llm-service", runners=[llm_runner])


@svc.on_startup
def download(_: bentoml.Context):
llm_runner.download_model()

llm_runner.download_model()

@svc.api(input=bentoml.io.Text(), output=bentoml.io.Text())
async def prompt(input_text: str) -> str:
answer = await llm_runner.generate.async_run(input_text)
return answer[0]["generated_text"]
answer = await llm_runner.generate.async_run(input_text)
return answer[0]["generated_text"]
35 changes: 11 additions & 24 deletions examples/langchain-chains-demo/service.py
@@ -25,23 +25,20 @@
from bentoml.io import Text

class Query(BaseModel):
industry: str
product_name: str
keywords: t.List[str]
llm_config: t.Dict[str, t.Any]

industry: str
product_name: str
keywords: t.List[str]
llm_config: t.Dict[str, t.Any]

def gen_llm(model_name: str, model_id: str | None = None) -> OpenLLM:
lc_llm = OpenLLM(model_name=model_name, model_id=model_id, embedded=False)
lc_llm.runner.download_model()
return lc_llm

lc_llm = OpenLLM(model_name=model_name, model_id=model_id, embedded=False)
lc_llm.runner.download_model()
return lc_llm

llm = gen_llm("dolly-v2", model_id="databricks/dolly-v2-7b")

prompt = PromptTemplate(
input_variables=["industry", "product_name", "keywords"],
template="""
input_variables=["industry", "product_name", "keywords"], template="""
You are a Facebook Ads Copywriter with a strong background in persuasive
writing and marketing. You craft compelling copy that appeals to the target
audience's emotions and needs, persuading them to take action or make a
@@ -59,22 +56,12 @@ def gen_llm(model_name: str, model_id: str | None = None) -> OpenLLM:

svc = bentoml.Service("fb-ads-copy", runners=[llm.runner])


@svc.on_startup
def download(_: bentoml.Context):
llm.runner.download_model()


SAMPLE_INPUT = Query(
industry="SAAS",
product_name="BentoML",
keywords=["open source", "developer tool", "AI application platform", "serverless", "cost-efficient"],
llm_config=llm.runner.config.model_dump(),
)
llm.runner.download_model()

SAMPLE_INPUT = Query(industry="SAAS", product_name="BentoML", keywords=["open source", "developer tool", "AI application platform", "serverless", "cost-efficient"], llm_config=llm.runner.config.model_dump(),)

@svc.api(input=JSON.from_sample(sample=SAMPLE_INPUT), output=Text())
def generate(query: Query):
return chain.run(
{"industry": query.industry, "product_name": query.product_name, "keywords": ", ".join(query.keywords)}
)
return chain.run({"industry": query.industry, "product_name": query.product_name, "keywords": ", ".join(query.keywords)})
9 changes: 2 additions & 7 deletions examples/langchain-tools-demo/service.py
@@ -22,16 +22,11 @@

SAMPLE_INPUT = "What is the weather in San Francisco?"

llm = OpenLLM(
model_name="dolly-v2",
model_id="databricks/dolly-v2-7b",
embedded=False,
)
llm = OpenLLM(model_name="dolly-v2", model_id="databricks/dolly-v2-7b", embedded=False,)
tools = load_tools(["serpapi"], llm=llm)
agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION)
svc = bentoml.Service("langchain-openllm", runners=[llm.runner])


@svc.api(input=Text.from_sample(sample=SAMPLE_INPUT), output=Text())
def chat(input_text: str):
return agent.run(input_text)
return agent.run(input_text)
44 changes: 44 additions & 0 deletions pyproject.toml
@@ -205,6 +205,7 @@ ignore = [
"PLR0915",
"PLR2004", # magic value to use constant
"E501", # ignore line length violation
"E702",
"PYI021", # ignore docstring in stubs, as pyright will include docstring in stubs.
"D103", # Just missing docstring for magic methods.
"D102",
@@ -262,6 +263,49 @@ avoid-escape = false
]
"typings/**/*" = ["D", "F", "E", "PYI002"]

[tool.yapf]
ALIGN_CLOSING_BRACKET_WITH_VISUAL_INDENT = true
ALLOW_MULTILINE_DICTIONARY_KEYS = false
ALLOW_MULTILINE_LAMBDAS = false
ALLOW_SPLIT_BEFORE_DEFAULT_OR_NAMED_ASSIGNS = false
ALLOW_SPLIT_BEFORE_DICT_VALUE = false
ARITHMETIC_PRECEDENCE_INDICATION = true
BLANK_LINES_AROUND_TOP_LEVEL_DEFINITION = 1
BLANK_LINES_BETWEEN_TOP_LEVEL_IMPORTS_AND_VARIABLES = 1
BLANK_LINE_BEFORE_CLASS_DOCSTRING = false
BLANK_LINE_BEFORE_MODULE_DOCSTRING = false
BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = false
COALESCE_BRACKETS = true
COLUMN_LIMIT = 384
CONTINUATION_ALIGN_STYLE = "VALIGN-RIGHT"
DEDENT_CLOSING_BRACKETS = true
DISABLE_ENDING_COMMA_HEURISTIC = true
EACH_DICT_ENTRY_ON_SEPARATE_LINE = false
INDENT_BLANK_LINES = false
INDENT_CLOSING_BRACKETS = false
INDENT_WIDTH = 2
JOIN_MULTIPLE_LINES = true
NO_SPACES_AROUND_SELECTED_BINARY_OPERATORS = true
SPACES_AROUND_SUBSCRIPT_COLON = false
SPACE_BETWEEN_ENDING_COMMA_AND_CLOSING_BRACKET = false
SPACE_INSIDE_BRACKETS = false
SPLIT_ALL_COMMA_SEPARATED_VALUES = false
SPLIT_ALL_TOP_LEVEL_COMMA_SEPARATED_VALUES = false
SPLIT_ARGUMENTS_WHEN_COMMA_TERMINATED = false
SPLIT_BEFORE_BITWISE_OPERATOR = false
SPLIT_BEFORE_CLOSING_BRACKET = false
SPLIT_BEFORE_DICT_SET_GENERATOR = false
SPLIT_BEFORE_DOT = false
SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = false
SPLIT_BEFORE_FIRST_ARGUMENT = false
SPLIT_BEFORE_LOGICAL_OPERATOR = false
SPLIT_BEFORE_NAMED_ASSIGNS = false
SPLIT_COMPLEX_COMPREHENSION = false
SPLIT_PENALTY_AFTER_OPENING_BRACKET = 10000
SPLIT_PENALTY_BEFORE_IF_EXPR = 10000
SPLIT_PENALTY_COMPREHENSION = 3000
SPLIT_PENALTY_FOR_ADDED_LINE_SPLIT = 8000

[tool.coverage.paths]
openllm = ["src/openllm", "*/openllm/src/openllm"]
[tool.coverage.run]
