From 39197b5f54cd84ff35022c851dd2dcb753ca6b89 Mon Sep 17 00:00:00 2001 From: Michael Tu Date: Mon, 15 Jun 2020 22:02:01 -0700 Subject: [PATCH] Initial code release --- .editorconfig | 20 + .gitignore | 8 + ACKNOWLEDGEMENTS | 297 +++++++++++++ CODE_OF_CONDUCT.md | 71 ++++ CONTRIBUTING.md | 7 + DATASETS.txt | 8 + LICENSE | 39 ++ README.md | 195 +++++++++ coverage.ini | 18 + doc/Makefile | 20 + doc/binary.rst | 25 ++ doc/common.rst | 83 ++++ doc/conf.py | 106 +++++ doc/data.rst | 12 + doc/index.rst | 10 + doc/models.rst | 20 + doc/quant.rst | 11 + doc/release_notes.rst | 33 ++ doc/utils.rst | 40 ++ examples/cifar100/cifar100.py | 24 ++ examples/cifar100/cifar100_fp.yaml | 72 ++++ examples/cifar100/cifar100_ls1.yaml | 81 ++++ examples/cifar100/cifar100_ls1_kd.yaml | 89 ++++ .../cifar100_ls1_weight_fp_activation_kd.yaml | 89 ++++ ...cifar100_ls1_weight_gf2_activation_kd.yaml | 89 ++++ ...cifar100_ls1_weight_ls2_activation_kd.yaml | 89 ++++ ...cifar100_ls1_weight_lsT_activation_kd.yaml | 89 ++++ examples/imagenet/imagenet.py | 24 ++ examples/imagenet/imagenet_fp.yaml | 75 ++++ examples/imagenet/imagenet_ls1_kd.yaml | 88 ++++ .../imagenet_ls1_weight_fp_activation_kd.yaml | 88 ++++ ...imagenet_ls1_weight_gf2_activation_kd.yaml | 88 ++++ ...imagenet_ls1_weight_ls2_activation_kd.yaml | 88 ++++ ...imagenet_ls1_weight_lsT_activation_kd.yaml | 88 ++++ examples/mnist/mnist.py | 22 + examples/mnist/mnist_fp.yaml | 44 ++ examples/mnist/mnist_ls1.yaml | 43 ++ .../mnist/mnist_ls1_weight_fp_activation.yaml | 43 ++ .../mnist_ls1_weight_gf2_activation.yaml | 43 ++ .../mnist_ls1_weight_ls2_activation.yaml | 43 ++ .../mnist_ls1_weight_lsT_activation.yaml | 43 ++ mypy.ini | 32 ++ pyproject.toml | 40 ++ pytest.ini | 28 ++ quant/__init__.py | 29 ++ quant/binary/__init__.py | 6 + quant/binary/activation_quantization.py | 239 +++++++++++ quant/binary/binary_conv.py | 173 ++++++++ quant/binary/optimal.py | 155 +++++++ quant/binary/quantization.py | 148 +++++++ quant/binary/ste.py | 70 +++ 
quant/binary/weight_quantization.py | 109 +++++ quant/common/__init__.py | 23 + quant/common/compute_platform.py | 114 +++++ quant/common/experiment.py | 125 ++++++ quant/common/initialization.py | 216 ++++++++++ quant/common/metrics.py | 218 ++++++++++ quant/common/parser.py | 261 ++++++++++++ quant/common/tasks.py | 232 ++++++++++ quant/common/training.py | 204 +++++++++ quant/data/__init__.py | 6 + quant/data/data_loaders.py | 375 +++++++++++++++++ quant/models/__init__.py | 6 + quant/models/lenet.py | 94 +++++ quant/models/resnet.py | 397 ++++++++++++++++++ quant/utils/__init__.py | 6 + quant/utils/checkpoints.py | 136 ++++++ quant/utils/kd_criterion.py | 52 +++ quant/utils/linear_lr_scheduler.py | 54 +++ quant/utils/moving_average.py | 39 ++ quant/utils/utils.py | 13 + quant/utils/visualization.py | 116 +++++ quant_logo.png | Bin 0 -> 11865 bytes requirements.txt | 5 + tests/__init__.py | 6 + tests/binary/__init__.py | 6 + tests/binary/test_activation_quantization.py | 258 ++++++++++++ tests/binary/test_binary_conv.py | 107 +++++ tests/binary/test_quantization.py | 165 ++++++++ tests/binary/test_ste.py | 36 ++ tests/binary/test_weight_quantization.py | 81 ++++ tests/common/__init__.py | 6 + tests/common/test_experiment.py | 33 ++ tests/common/test_initialization.py | 165 ++++++++ tests/common/test_metrics.py | 155 +++++++ tests/common/test_parser.py | 45 ++ tests/common/test_tasks.py | 110 +++++ tests/common/test_training.py | 75 ++++ tests/conftest.py | 78 ++++ tests/data/__init__.py | 6 + tests/data/helpers.py | 114 +++++ tests/data/test_data_loaders.py | 87 ++++ tests/models/__init__.py | 6 + tests/models/test_resnet.py | 136 ++++++ tests/utils/__init__.py | 6 + tests/utils/test_linear_lr_scheduler.py | 40 ++ tests/utils/test_moving_average.py | 166 ++++++++ 97 files changed, 8073 insertions(+) create mode 100644 .editorconfig create mode 100644 .gitignore create mode 100644 ACKNOWLEDGEMENTS create mode 100644 CODE_OF_CONDUCT.md create mode 100644 
CONTRIBUTING.md create mode 100644 DATASETS.txt create mode 100644 LICENSE create mode 100644 README.md create mode 100644 coverage.ini create mode 100644 doc/Makefile create mode 100644 doc/binary.rst create mode 100644 doc/common.rst create mode 100644 doc/conf.py create mode 100644 doc/data.rst create mode 100644 doc/index.rst create mode 100644 doc/models.rst create mode 100644 doc/quant.rst create mode 100644 doc/release_notes.rst create mode 100644 doc/utils.rst create mode 100644 examples/cifar100/cifar100.py create mode 100644 examples/cifar100/cifar100_fp.yaml create mode 100644 examples/cifar100/cifar100_ls1.yaml create mode 100644 examples/cifar100/cifar100_ls1_kd.yaml create mode 100644 examples/cifar100/cifar100_ls1_weight_fp_activation_kd.yaml create mode 100644 examples/cifar100/cifar100_ls1_weight_gf2_activation_kd.yaml create mode 100644 examples/cifar100/cifar100_ls1_weight_ls2_activation_kd.yaml create mode 100644 examples/cifar100/cifar100_ls1_weight_lsT_activation_kd.yaml create mode 100644 examples/imagenet/imagenet.py create mode 100644 examples/imagenet/imagenet_fp.yaml create mode 100644 examples/imagenet/imagenet_ls1_kd.yaml create mode 100644 examples/imagenet/imagenet_ls1_weight_fp_activation_kd.yaml create mode 100644 examples/imagenet/imagenet_ls1_weight_gf2_activation_kd.yaml create mode 100644 examples/imagenet/imagenet_ls1_weight_ls2_activation_kd.yaml create mode 100644 examples/imagenet/imagenet_ls1_weight_lsT_activation_kd.yaml create mode 100644 examples/mnist/mnist.py create mode 100644 examples/mnist/mnist_fp.yaml create mode 100644 examples/mnist/mnist_ls1.yaml create mode 100644 examples/mnist/mnist_ls1_weight_fp_activation.yaml create mode 100644 examples/mnist/mnist_ls1_weight_gf2_activation.yaml create mode 100644 examples/mnist/mnist_ls1_weight_ls2_activation.yaml create mode 100644 examples/mnist/mnist_ls1_weight_lsT_activation.yaml create mode 100644 mypy.ini create mode 100644 pyproject.toml create mode 100644 
pytest.ini create mode 100644 quant/__init__.py create mode 100644 quant/binary/__init__.py create mode 100644 quant/binary/activation_quantization.py create mode 100644 quant/binary/binary_conv.py create mode 100644 quant/binary/optimal.py create mode 100644 quant/binary/quantization.py create mode 100644 quant/binary/ste.py create mode 100644 quant/binary/weight_quantization.py create mode 100644 quant/common/__init__.py create mode 100644 quant/common/compute_platform.py create mode 100644 quant/common/experiment.py create mode 100644 quant/common/initialization.py create mode 100644 quant/common/metrics.py create mode 100644 quant/common/parser.py create mode 100644 quant/common/tasks.py create mode 100644 quant/common/training.py create mode 100644 quant/data/__init__.py create mode 100644 quant/data/data_loaders.py create mode 100644 quant/models/__init__.py create mode 100644 quant/models/lenet.py create mode 100644 quant/models/resnet.py create mode 100644 quant/utils/__init__.py create mode 100644 quant/utils/checkpoints.py create mode 100644 quant/utils/kd_criterion.py create mode 100644 quant/utils/linear_lr_scheduler.py create mode 100644 quant/utils/moving_average.py create mode 100644 quant/utils/utils.py create mode 100644 quant/utils/visualization.py create mode 100644 quant_logo.png create mode 100644 requirements.txt create mode 100644 tests/__init__.py create mode 100644 tests/binary/__init__.py create mode 100644 tests/binary/test_activation_quantization.py create mode 100644 tests/binary/test_binary_conv.py create mode 100644 tests/binary/test_quantization.py create mode 100644 tests/binary/test_ste.py create mode 100644 tests/binary/test_weight_quantization.py create mode 100644 tests/common/__init__.py create mode 100644 tests/common/test_experiment.py create mode 100644 tests/common/test_initialization.py create mode 100644 tests/common/test_metrics.py create mode 100644 tests/common/test_parser.py create mode 100644 
tests/common/test_tasks.py create mode 100644 tests/common/test_training.py create mode 100644 tests/conftest.py create mode 100644 tests/data/__init__.py create mode 100644 tests/data/helpers.py create mode 100644 tests/data/test_data_loaders.py create mode 100644 tests/models/__init__.py create mode 100644 tests/models/test_resnet.py create mode 100644 tests/utils/__init__.py create mode 100644 tests/utils/test_linear_lr_scheduler.py create mode 100644 tests/utils/test_moving_average.py diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..6493ef2 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,20 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# +# http://editorconfig.org + +# top-most EditorConfig file +root = true + +# Default Configuration for most files +[*] +indent_style = space +indent_size = 4 +trim_trailing_whitespace = true +insert_final_newline = true +charset = utf-8 +end_of_line = lf + +[Makefile] +indent_style = tab diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..280a421 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +*.pyc +*.swp +.coverage +.mypy_cache/ +.venv/ +.idea/ +dist/ +runs/ diff --git a/ACKNOWLEDGEMENTS b/ACKNOWLEDGEMENTS new file mode 100644 index 0000000..ce1bba1 --- /dev/null +++ b/ACKNOWLEDGEMENTS @@ -0,0 +1,297 @@ +Acknowledgements + +Portions of ml-quant may utilize the following copyrighted +material, the use of which is hereby acknowledged. + +_____________________ + +AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team (pandas) + BSD 3-Clause License + + Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team + All rights reserved. + + Copyright (c) 2011-2020, Open source contributors. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +Holger Krekel and others (pytest) + The MIT License (MIT) + + Copyright (c) 2004-2020 Holger Krekel and others + + Permission is hereby granted, free of charge, to any person obtaining a copy of + this software and associated documentation files (the "Software"), to deal in + the Software without restriction, including without limitation the rights to + use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + of the Software, and to permit persons to whom the Software is furnished to do + so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + +Jukka Lehtosalo and contributors (mypy) + The MIT License + + Copyright (c) 2015-2019 Jukka Lehtosalo and contributors + + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the "Software"), + to deal in the Software without restriction, including without limitation + the rights to use, copy, modify, merge, publish, distribute, sublicense, + and/or sell copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE. + +Ingy döt Net, Kirill Simonov (PyYAML) + Copyright (c) 2017-2020 Ingy döt Net + Copyright (c) 2006-2016 Kirill Simonov + + Permission is hereby granted, free of charge, to any person obtaining a copy of + this software and associated documentation files (the "Software"), to deal in + the Software without restriction, including without limitation the rights to + use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + of the Software, and to permit persons to whom the Software is furnished to do + so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+ +pytest-cov Authors (pytest-cov) + The MIT License + + Copyright (c) 2010 Meme Dough + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + +The PyTorch Authors (PyTorch) + From PyTorch: + + Copyright (c) 2016- Facebook, Inc (Adam Paszke) + Copyright (c) 2014- Facebook, Inc (Soumith Chintala) + Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) + Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) + Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) + Copyright (c) 2011-2013 NYU (Clement Farabet) + Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) + Copyright (c) 2006 Idiap Research Institute (Samy Bengio) + Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) + + From Caffe2: + + Copyright (c) 2016-present, Facebook Inc. All rights reserved. + + All contributions by Facebook: + Copyright (c) 2016 Facebook Inc. 
+ + All contributions by Google: + Copyright (c) 2015 Google Inc. + All rights reserved. + + All contributions by Yangqing Jia: + Copyright (c) 2015 Yangqing Jia + All rights reserved. + + All contributions from Caffe: + Copyright(c) 2013, 2014, 2015, the respective contributors + All rights reserved. + + All other contributions: + Copyright(c) 2015, 2016 the respective contributors + All rights reserved. + + Caffe2 uses a copyright model similar to Caffe: each contributor holds + copyright over their contributions to Caffe2. The project versioning records + all such contribution and copyright details. If a contributor wants to further + mark their specific copyright on a particular contribution, they should + indicate their copyright solely in the commit message of the change when it is + committed. + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America + and IDIAP Research Institute nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + +The Sphinx team (Sphinx) + Copyright (c) 2007-2019 by the Sphinx team (see AUTHORS file). + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +The TensorFlow Authors (TensorBoard) + Copyright 2017, The TensorFlow Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +The torchvision Authors (torchvision) + BSD 3-Clause License + + Copyright (c) Soumith Chintala 2016, + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Tarek Ziade, Ian Cordasco (flake8) + Copyright (C) 2011-2013 Tarek Ziade + Copyright (C) 2012-2016 Ian Cordasco + + Permission is hereby granted, free of charge, to any person obtaining a copy of + this software and associated documentation files (the "Software"), to deal in + the Software without restriction, including without limitation the rights to + use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + of the Software, and to permit persons to whom the Software is furnished to do + so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..c991377 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,71 @@ +# Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to making participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. 
+ +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies within all project spaces, and it also applies when +an individual is representing the project or its community in public spaces. +Examples of representing a project or community include using an official +project e-mail address, posting via an official social media account, or acting +as an appointed representative at an online or offline event. Representation of +a project may be further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the open source team at [opensource-conduct@group.apple.com](mailto:opensource-conduct@group.apple.com). All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. 
+ +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 1.4, +available at [https://www.contributor-covenant.org/version/1/4/code-of-conduct.html](https://www.contributor-covenant.org/version/1/4/code-of-conduct.html) \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..03d1703 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,7 @@ +# Contribution Guide + +Thanks for your interest in contributing. This project was released to accompany a research paper for purposes of reproducibility, and beyond its publication there are limited plans for future development of the repository. + +## Before you get started + +We ask that all community members read and observe our [Code of Conduct](CODE_OF_CONDUCT.md). \ No newline at end of file diff --git a/DATASETS.txt b/DATASETS.txt new file mode 100644 index 0000000..f76fd7b --- /dev/null +++ b/DATASETS.txt @@ -0,0 +1,8 @@ +The MNIST, CIFAR-10, CIFAR-100 and ImageNet datasets are not Apple owned or created datasets. + +Your use of such datasets is subject to the third party’s rights and licensing terms. + +Below are links to the original datasets for your review: +* MNIST: http://yann.lecun.com/exdb/mnist/ +* CIFAR-10 and CIFAR-100: https://www.cs.toronto.edu/~kriz/cifar.html +* ImageNet: http://image-net.org/download-faq diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..8ce4f2e --- /dev/null +++ b/LICENSE @@ -0,0 +1,39 @@ +Copyright (C) 2020 Apple Inc. All Rights Reserved. + +IMPORTANT: This Apple software is supplied to you by Apple +Inc. ("Apple") in consideration of your agreement to the following +terms, and your use, installation, modification or redistribution of +this Apple software constitutes acceptance of these terms. If you do +not agree with these terms, please do not use, install, modify or +redistribute this Apple software. 
+ +In consideration of your agreement to abide by the following terms, and +subject to these terms, Apple grants you a personal, non-exclusive +license, under Apple's copyrights in this original Apple software (the +"Apple Software"), to use, reproduce, modify and redistribute the Apple +Software, with or without modifications, in source and/or binary forms; +provided that if you redistribute the Apple Software in its entirety and +without modifications, you must retain this notice and the following +text and disclaimers in all such redistributions of the Apple Software. +Neither the name, trademarks, service marks or logos of Apple Inc. may +be used to endorse or promote products derived from the Apple Software +without specific prior written permission from Apple. Except as +expressly stated in this notice, no other rights or licenses, express or +implied, are granted by Apple herein, including but not limited to any +patent rights that may be infringed by your derivative works or by other +works in which the Apple Software may be incorporated. + +The Apple Software is provided by Apple on an "AS IS" basis. APPLE +MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION +THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND +OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS. + +IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL +OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, +MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED +AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE), +STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
diff --git a/README.md b/README.md new file mode 100644 index 0000000..0248aca --- /dev/null +++ b/README.md @@ -0,0 +1,195 @@ +# Quant + + + +This repository is a PyTorch implementation of [Least Squares Binary Quantization of Neural Networks](http://openaccess.thecvf.com/content_CVPRW_2020/papers/w40/Pouransari_Least_Squares_Binary_Quantization_of_Neural_Networks_CVPRW_2020_paper.pdf) and can be used to reproduce the results in the paper. + +**The code is written to use Python 3.6 or above.** + +## Installation + +To install Quant you first need to clone our repository. + +We suggest you first create a virtual environment and install dependencies in the virtual environment. + +```bash +# Go to repo +cd <path-to-repo> +# Create virtual environment ... +python -m venv .venv +# ... and activate it +source .venv/bin/activate +# Upgrade to the latest versions of pip and wheel +pip install -U pip wheel +pip install -r requirements.txt +``` + +Then install quant with these commands: + +```bash +pip install flit +flit install -s +``` + +## Quick Start + +To run MNIST training on the local machine, do this: + +```bash +python examples/mnist/mnist.py --config examples/mnist/mnist_fp.yaml --experiment-name mnist-fp +``` + +One can also resume an existing experiment. +For example, here we restore an experiment trained locally. +The `--restore-experiment` argument points to the path of a previous experiment, +and `--skip-training` means for the resumed job we would like to only perform evaluation (i.e., no training). + +```bash +python examples/mnist/mnist.py --restore-experiment experiments/mnist-fp --skip-training +``` + +For CIFAR-100 and ImageNet, the CLI interface is the same. +Simply use the configs in the `examples/{mnist,cifar100,imagenet}/` directories. 
+ +[mnist_fp.yaml](./examples/mnist/mnist_fp.yaml), [cifar100_fp.yaml](./examples/cifar100/cifar100_fp.yaml) and [imagenet_fp.yaml](./examples/imagenet/imagenet_fp.yaml) +include comments that list configuration choices for some important parameters with references to documentation sections that explain them in more detail. + +All experiments store the configurations used, overall metrics, checkpoints, and a copy +of TensorBoard logs in a directory with the experiment name. +The experiment name can be optionally specified using `--experiment-name <experiment-name>`. +If it is not specified, the current datetime with config name is used. + +The experiments artifacts directory looks like this: + +```bash +$ ls experiments/my_experiment_name/ +checkpoints config.yaml metrics tensorboard +``` + +## Experiment Results + +### CIFAR-100 + +We can first train a teacher using: + +```bash +python examples/cifar100/cifar100.py --config examples/cifar100/cifar100_fp.yaml --experiment-name cifar100-teacher +``` + +Then, we can train a quantized student model using a teacher checkpoint in the experiments artifacts directory. +The student config has paths that point to the teacher config / checkpoint. +If you used the command above, the paths in the default config files should refer to the checkpoint you just trained: + +```yaml +kd_config: + teacher_config_path: examples/cifar100/cifar100_fp.yaml + teacher_checkpoint_path: experiments/cifar100-teacher/checkpoints/checkpoint_200.pt +``` + +Then we can train a quantized student model, for example with 2-bits activation: + +```bash +python examples/cifar100/cifar100.py --config examples/cifar100/cifar100_ls1_weight_ls2_activation_kd.yaml --experiment-name cifar100-ls2 +``` + +All configs ending with `*_kd.yaml` use Knowledge Distillation (KD) and require a pre-trained teacher checkpoint. +If you want to train without knowledge distillation, just remove the `kd_config` section from the corresponding config file. 
+`cifar100_fp.yaml` is a config that does not have this `kd_config` section, for example. + +Here are the results we obtained using the configs in the `examples/cifar100` directory. + +| Config | `k^a` | `k^w` | top-1 accuracy | top-5 accuracy | +| ------------------------------------------------------------------------------------------------------------ |:------:|:------:|:--------------:|:--------------:| +| [cifar100_ls1_kd.yaml](./examples/cifar100/cifar100_ls1_kd.yaml) | 1 | 1 | 71.5 | 92.0 | +| [cifar100_ls1_weight_lsT_activation_kd.yaml](./examples/cifar100/cifar100_ls1_weight_lsT_activation_kd.yaml) | T | 1 | 73.5 | 92.8 | +| [cifar100_ls1_weight_gf2_activation_kd.yaml](./examples/cifar100/cifar100_ls1_weight_gf2_activation_kd.yaml) | 2 | 1 | 74.3 | 93.1 | +| [cifar100_ls1_weight_ls2_activation_kd.yaml](./examples/cifar100/cifar100_ls1_weight_ls2_activation_kd.yaml) | 2 | 1 | 74.4 | 92.9 | +| [cifar100_ls1_weight_fp_activation_kd.yaml](./examples/cifar100/cifar100_ls1_weight_fp_activation_kd.yaml) | 32 | 1 | 76.2 | 93.7 | +| [cifar100_fp.yaml](./examples/cifar100/cifar100_fp.yaml) | 32 | 32 | 77.8 | 93.9 | + +### ImageNet + +The configs in this repo for ImageNet use 8 GPUs. +Please adapt this setting as needed for your setup. + +We can first train a teacher using: + +```bash +python examples/imagenet/imagenet.py --config examples/imagenet/imagenet_fp.yaml --experiment-name imagenet-teacher +``` + +Then, we can train a quantized student model using a teacher checkpoint in the experiments artifacts directory. +The student config has paths that point to the teacher config / checkpoint. 
+If you used the command above, the paths in the default config files should refer to the checkpoint you just trained: + +```yaml +kd_config: + teacher_config_path: examples/imagenet/imagenet_fp.yaml + teacher_checkpoint_path: experiments/imagenet-teacher/checkpoints/checkpoint_100.pt +``` + +Then we can train a quantized student model, for example with 2-bits activation: + +```bash +python examples/imagenet/imagenet.py --config examples/imagenet/imagenet_ls1_weight_ls2_activation_kd.yaml --experiment-name imagenet-ls2 +``` + +All configs ending with `*_kd.yaml` use Knowledge Distillation (KD) and require a pre-trained teacher checkpoint. +If you want to train without knowledge distillation, just remove the `kd_config` section from the corresponding config file. +`imagenet_fp.yaml` is a config that does not have this `kd_config` section, for example. + +Here are the results we obtained using the configs in the `examples/imagenet` directory. +These configs can be used to reproduce the results in the paper. +The `ls-2` 240 epochs job can take around 9 days, while the `ls-1` 240 epochs job can take around 6 days on 8 x NVIDIA Tesla V100 GPUs. 
+ +| Config | `k^a` | `k^w` | top-1 accuracy | top-5 accuracy | +| ------------------------------------------------------------------------------------------------------------ |:------:|:------:|:--------------:|:--------------:| +| [imagenet_ls1_kd.yaml](./examples/imagenet/imagenet_ls1_kd.yaml) | 1 | 1 | 58.9 | 81.4 | +| [imagenet_ls1_weight_lsT_activation_kd.yaml](./examples/imagenet/imagenet_ls1_weight_lsT_activation_kd.yaml) | T | 1 | 62.0 | 83.6 | +| [imagenet_ls1_weight_gf2_activation_kd.yaml](./examples/imagenet/imagenet_ls1_weight_gf2_activation_kd.yaml) | 2 | 1 | 62.6 | 84.0 | +| [imagenet_ls1_weight_ls2_activation_kd.yaml](./examples/imagenet/imagenet_ls1_weight_ls2_activation_kd.yaml) | 2 | 1 | 63.4 | 84.6 | +| [imagenet_ls1_weight_fp_activation_kd.yaml](./examples/imagenet/imagenet_ls1_weight_fp_activation_kd.yaml) | 32 | 1 | 66.1 | 86.5 | +| [imagenet_fp.yaml](./examples/imagenet/imagenet_fp.yaml) | 32 | 32 | 69.8 | 89.3 | + +## TensorBoard + +The config files in `examples/` all have the TensorBoard server turned on by default. +While training is running, you can go to [http://localhost:6006](http://localhost:6006) to view TensorBoard. +If the `TENSORBOARD_PORT` environment variable is set, it overrides the default port. + +By default, TensorBoard logs are saved under `runs/` (configured via `tensorboard_root` in config files). +You can also run your own `tensorboard` instance pointing to this log directory if you do not want TensorBoard to terminate after training finishes. +The logs are copied to the experiment directory when a run finishes. + +## Tests + +To run the tests, make sure you have followed the installation instructions and then run +the `pytest` from the root directory of this package. This will run all our tests, +static analysis, coverage analysis and style checks. + +## Documentation + +To build the docs you only need to make a directory adjacent to this repo in the parent directory and run the `make html` command. 
+ +```bash +mkdir -p ../quant-docs-build +cd doc +make html +``` + +## Contact + +* **Hadi Pouransari**: mpouransari@apple.com +* **Michael Tu**: zhucheng_tu@apple.com + +## Citation + +```bibtex +@InProceedings{Pouransari_2020_CVPR_Workshops, + author = {Pouransari, Hadi and Tu, Zhucheng and Tuzel, Oncel}, + title = {Least Squares Binary Quantization of Neural Networks}, + booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops}, + month = {June}, + year = {2020} +} +``` diff --git a/coverage.ini b/coverage.ini new file mode 100644 index 0000000..5db8a53 --- /dev/null +++ b/coverage.ini @@ -0,0 +1,18 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# + +[run] +# Source for coverage analysis +source = quant + +# We don't care about coverage in the tests themselves +omit = tests/* + +# Perform branching analysis as well +branch = True + +[report] +# Show which lines are not covered by tests +show_missing = True diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 0000000..59c7299 --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SOURCEDIR = . +BUILDDIR = ../../quant-docs-build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + cp ../quant_logo.png ../../quant-docs-build/html/ diff --git a/doc/binary.rst b/doc/binary.rst new file mode 100644 index 0000000..0af7699 --- /dev/null +++ b/doc/binary.rst @@ -0,0 +1,25 @@ +.. 
currentmodule:: quant.binary + +Binary Quantization +=================== + +Convolution +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: quant.binary.binary_conv + :members: + :special-members: __init__ + :undoc-members: + +Quantization Classes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: quant.binary.activation_quantization + :members: + :special-members: __init__ + :undoc-members: + +.. automodule:: quant.binary.weight_quantization + :members: + :special-members: __init__ + :undoc-members: diff --git a/doc/common.rst b/doc/common.rst new file mode 100644 index 0000000..52b355a --- /dev/null +++ b/doc/common.rst @@ -0,0 +1,83 @@ +.. currentmodule:: quant.common + +Common +====== + +This module contains common code for running the code, performing training and evaluation. + +.. note:: + + If you are just running the example code to reproduce the paper, you do not need to read + the sections below :ref:`Config File` and :ref:`CLI Args`. If you want to write your own + driver scripts that use Quant for your tasks, you may find the additional documentation + helpful. + +.. _Config File: + +Config File +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: quant.common.parser + +.. _CLI Args: + +CLI Args +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +One can always use ``--help`` if running any of the example scripts to see the arguments. + +``--config <config-path>`` specifies the path to the yaml config file. + +The experiment can be given a name with ``--experiment-name <experiment-name>``. +If no name is specified a name is chosen based on the dataset name and time. + +``--ngpus <number-of-gpus>`` can be used to set or override the number of GPUs setting +in the config. + +``--init-from-checkpoint <checkpoint-path>`` can be used to initialize the model from a checkpoint. +See :meth:`~quant.utils.checkpoints.restore_from_checkpoint` for more details. +This only restores the model from the checkpoint, but not the optimizer or scheduler state. 
+ +Alternatively, ``--restore-checkpoint `` can be used +to resume training from a checkpoint. The last checkpoint will be used. + +If either ``--init-from-checkpoint`` or ``--restore-checkpoint`` is used, +``--skip-training`` can be set to perform only inference on the test set. + +Initializing Device, Model, and Optimizer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: quant.common.initialization + :members: + :special-members: __init__ + :undoc-members: + +Experiment +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: quant.common.experiment + :members: + :special-members: __init__ + :undoc-members: + +Compute Platform +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: quant.common.compute_platform + :members: + :special-members: __init__ + :undoc-members: + +Metrics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: quant.common.metrics + :members: + :special-members: __init__ + :undoc-members: + +Training +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: quant.common.training + :members: diff --git a/doc/conf.py b/doc/conf.py new file mode 100644 index 0000000..0ec9a7b --- /dev/null +++ b/doc/conf.py @@ -0,0 +1,106 @@ +# -*- coding: utf-8 -*- +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# + +import os +import sys +sys.path.insert(0, os.path.abspath('../')) + +import quant + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. 
+# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.napoleon', + 'sphinx.ext.autodoc', + 'sphinx_autodoc_typehints', + 'sphinx.ext.intersphinx', + 'm2r' +] + + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +source_suffix = ['.rst', '.md'] + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = 'Quant' +copyright = '2020, Apple Inc.' +author = 'Hadi Pouransari, Zhucheng Tu' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# + +# The full version, including alpha/beta/rc tags. +release = quant.__version__ + +# The short X.Y version. +version = quant.__version__ + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. 
+# +html_theme = 'sphinx_rtd_theme' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +# html_static_path = [] + + +# This add links to the python documentation on all standard python objects. +# Feel free to add further dependencies you want to link to. +# [https://www.sphinx-doc.org/en/master/usage/extensions/intersphinx.html] +intersphinx_mapping = {'python': ('https://docs.python.org/3', None), + 'torch': ('https://pytorch.org/docs/master/', None)} diff --git a/doc/data.rst b/doc/data.rst new file mode 100644 index 0000000..df9cc13 --- /dev/null +++ b/doc/data.rst @@ -0,0 +1,12 @@ +.. currentmodule:: quant.data + +Data +============== + +Data Loaders +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: quant.data.data_loaders + :members: + :special-members: __init__ + :undoc-members: diff --git a/doc/index.rst b/doc/index.rst new file mode 100644 index 0000000..6ca762b --- /dev/null +++ b/doc/index.rst @@ -0,0 +1,10 @@ +.. mdinclude:: ../README.md + +Documentation Contents +====================== + +.. toctree:: + :maxdepth: 1 + + release_notes + quant diff --git a/doc/models.rst b/doc/models.rst new file mode 100644 index 0000000..f9795b5 --- /dev/null +++ b/doc/models.rst @@ -0,0 +1,20 @@ +.. currentmodule:: quant.models + +Models +============== + +LeNet +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: quant.models.lenet + :members: + :special-members: __init__ + :undoc-members: + +ResNet +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
automodule:: quant.models.resnet + :members: + :special-members: __init__ + :undoc-members: diff --git a/doc/quant.rst b/doc/quant.rst new file mode 100644 index 0000000..b63604e --- /dev/null +++ b/doc/quant.rst @@ -0,0 +1,11 @@ +API Documentation +================= + +.. toctree:: + :maxdepth: 1 + + binary + common + data + models + utils diff --git a/doc/release_notes.rst b/doc/release_notes.rst new file mode 100644 index 0000000..47db633 --- /dev/null +++ b/doc/release_notes.rst @@ -0,0 +1,33 @@ +============= +Release Notes +============= + +Current Release +=============== + +v.0.2.0 (2020/06/15) +-------------------------------------- + + * Improved documentation + * Module re-organization: move modules from `common` to `utils` + * Fix moving average bugs + * Use original loss function instead of kd loss function for eval + * LeNet quantization bugfixes + * Remove unneeded data augmentation from data loader + +v.0.1.0 (2020/03/30) +-------------------------------------- + + * Initial release of the library + * Support for the following quantization methods: least squares 1-bit (ls-1), 2-bits (ls-2), ternary (ls-T), and greedy foldable (gf) + * Dataset loaders for MNIST, CIFAR-10, CIFAR-100, ImageNet + * Quantized module for ``nn.Conv2d`` + * LeNet and ResNet (regular block and XNOR block variants) models + * Code required for running training and inference + * Support for training with a teacher + * Support for using moving average during inference to avoid re-computing scalars + +Known Issues +------------ + + * If you installed all of the dependencies following the instructions, but get TensorBoard not found, try deactivating the virtualenv and re-activating it. diff --git a/doc/utils.rst b/doc/utils.rst new file mode 100644 index 0000000..0d4854f --- /dev/null +++ b/doc/utils.rst @@ -0,0 +1,40 @@ +.. currentmodule:: quant.utils + +Common +====== + +This module contains utility classes and functions. 
+ +Saving and Restoring Checkpoints +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: quant.utils.checkpoints + :members: + :special-members: __init__ + :undoc-members: + +Utilities for Training and Evaluation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: quant.utils.linear_lr_scheduler + :members: + :special-members: __init__ + :undoc-members: + +.. automodule:: quant.utils.kd_criterion + :members: + :special-members: __init__ + :undoc-members: + +.. automodule:: quant.utils.moving_average + :members: + :special-members: __init__ + :undoc-members: + +TensorBoard Visualization +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: quant.utils.visualization + :members: + :special-members: __init__ + :undoc-members: diff --git a/examples/cifar100/cifar100.py b/examples/cifar100/cifar100.py new file mode 100644 index 0000000..85d46fa --- /dev/null +++ b/examples/cifar100/cifar100.py @@ -0,0 +1,24 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. 
+# + +"""Driver script for running CIFAR-100.""" + +from quant.common.compute_platform import LocalComputePlatform +from quant.common.experiment import Experiment +from quant.common.parser import get_base_argument_parser, parse_config +from quant.common.tasks import classification_task +from quant.data.data_loaders import CIFAR100DataLoader +from quant.utils.visualization import get_tensorboard_hooks + + +if __name__ == '__main__': + parser = get_base_argument_parser('Driver script for running CIFAR-100.') + args = parser.parse_args() + config = parse_config(args) + platform = LocalComputePlatform(config['log'].get('root_experiments_dir', '.')) + experiment = Experiment( + classification_task, config, CIFAR100DataLoader, get_tensorboard_hooks + ) + platform.run(experiment) diff --git a/examples/cifar100/cifar100_fp.yaml b/examples/cifar100/cifar100_fp.yaml new file mode 100644 index 0000000..d14dc6c --- /dev/null +++ b/examples/cifar100/cifar100_fp.yaml @@ -0,0 +1,72 @@ +# Validation set evaluation metrics: +# Top-1 Accuracy: 77.8% +# Top-5 Accuracy: 93.9% +seed: null +environment: + platform: local + ngpus: 1 + cuda: + cudnn_deterministic: false + cudnn_benchmark: true +data: + dataset_path: data/cifar100/ + train_batch_size: 128 + test_batch_size: 100 + workers: 16 +model: + architecture: resnet + loss: cross_entropy # select from {'cross_entropy', 'nll_loss', 'kl_div'}, see get_loss_fn() docs for details. + arch_config: + moving_average_mode: 'off' # select from {'off', 'eval_only', 'train_and_eval'}, see ActivationQuantizer docs for details. + moving_average_momentum: 0.99 + block: regular # select from {'regular', 'xnor'}, see QResNet docs for details. + layer0: + n_in_channels: 64 + kernel_size: 3 + stride: 1 + padding: 1 + bias: false + maxpool: + type: identity + layer1: + x_quant: fp # select from {'fp', 'ls-1', 'ls-T', 'ls-2', 'gf-1', 'gf-2', 'gf-3' (any `gf-k`)}, see QuantConv2d docs for details. 
+ w_quant: fp # select from {'fp', 'ls-1', 'ls-T', 'ls-2', 'gf-1', 'gf-2', 'gf-3' (any `gf-k`)}, see QuantConv2d docs for details. + clamp: + kind: identity # select from {'identity', 'symmetric'}, see QuantConv2d docs for details. + layer2: + x_quant: fp + w_quant: fp + clamp: + kind: identity + layer3: + x_quant: fp + w_quant: fp + clamp: + kind: identity + layer4: + x_quant: fp + w_quant: fp + clamp: + kind: identity + nonlins: ['relu', 'relu'] # A list of 2 strings where each string is in {'relu', 'prelu', 'identity'}. + num_blocks: [2, 2, 2, 2] + output_classes: 100 +optimization: + epochs: 200 + optimizer: + algorithm: sgd # select from {'sgd', 'adam', 'adadelta'}, see get_optimizer() docs for details. + lr: 0.1 + momentum: 0.9 + nesterov: true + weight_decay: 0.0005 + lr_scheduler: + scheduler: step_lr # select from {'step_lr', 'multi_step_lr', 'linear_lr', 'lambda_lr'}, see get_lr_scheduler() docs for details. + step_size: 60 + gamma: 0.2 +log: + level: INFO + interval: 100 + tensorboard: true + tensorboard_root: runs/ + root_experiments_dir: experiments/ + save_model_freq: 80 diff --git a/examples/cifar100/cifar100_ls1.yaml b/examples/cifar100/cifar100_ls1.yaml new file mode 100644 index 0000000..22b246a --- /dev/null +++ b/examples/cifar100/cifar100_ls1.yaml @@ -0,0 +1,81 @@ +# Validation set evaluation metrics: +# Top-1 Accuracy: 64.5% +# Top-5 Accuracy: 87.7% +seed: null +environment: + platform: local + ngpus: 1 + cuda: + cudnn_deterministic: false + cudnn_benchmark: true +data: + dataset_path: data/cifar100/ + train_batch_size: 128 + test_batch_size: 100 + workers: 16 +model: + architecture: resnet + loss: cross_entropy + arch_config: + moving_average_mode: 'off' + moving_average_momentum: 0.99 + block: xnor + layer0: + n_in_channels: 64 + kernel_size: 3 + stride: 1 + padding: 1 + bias: false + maxpool: + type: identity + layer1: + x_quant: ls-1 + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + layer2: + x_quant: ls-1 + 
w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + layer3: + x_quant: ls-1 + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + layer4: + x_quant: ls-1 + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + nonlins: ['relu', 'relu'] + num_blocks: [2, 2, 2, 2] + output_classes: 100 +optimization: + epochs: 350 + optimizer: + algorithm: adam + lr: 0.0002 + weight_decay: 0.0 + lr_scheduler: + scheduler: multi_step_lr + milestones: + - 150 + - 250 + - 320 + gamma: 0.1 +log: + level: INFO + interval: 100 + tensorboard: true + tensorboard_root: runs/ + root_experiments_dir: experiments/ + save_model_freq: 80 diff --git a/examples/cifar100/cifar100_ls1_kd.yaml b/examples/cifar100/cifar100_ls1_kd.yaml new file mode 100644 index 0000000..e4b7645 --- /dev/null +++ b/examples/cifar100/cifar100_ls1_kd.yaml @@ -0,0 +1,89 @@ +# Validation set evaluation metrics: +# Top-1 Accuracy: 71.5% +# Top-5 Accuracy: 92.0% +seed: null +environment: + platform: local + ngpus: 1 + cuda: + cudnn_deterministic: false + cudnn_benchmark: true +data: + dataset_path: data/cifar100/ + train_batch_size: 128 + test_batch_size: 100 + workers: 16 +model: + architecture: resnet + loss: cross_entropy + arch_config: + moving_average_mode: 'off' + moving_average_momentum: 0.99 + block: xnor + layer0: + n_in_channels: 64 + kernel_size: 3 + stride: 1 + padding: 1 + bias: false + maxpool: + type: identity + layer1: + x_quant: ls-1 + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + layer2: + x_quant: ls-1 + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + layer3: + x_quant: ls-1 + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + layer4: + x_quant: ls-1 + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + nonlins: ['relu', 'relu'] + num_blocks: [2, 2, 2, 2] + output_classes: 100 + kd_config: + teacher_config_path: 
experiments/cifar100-teacher/config.yaml + teacher_checkpoint_path: experiments/cifar100-teacher/checkpoints/checkpoint_200.pt + freeze_teacher: true + train_mode: true + criterion_config: + temperature: 5 + teacher_correction: true +optimization: + epochs: 350 + optimizer: + algorithm: adam + lr: 0.0002 + weight_decay: 0.0 + lr_scheduler: + scheduler: multi_step_lr + milestones: + - 150 + - 250 + - 320 + gamma: 0.1 +log: + level: INFO + interval: 100 + tensorboard: true + tensorboard_root: runs/ + root_experiments_dir: experiments/ + save_model_freq: 80 diff --git a/examples/cifar100/cifar100_ls1_weight_fp_activation_kd.yaml b/examples/cifar100/cifar100_ls1_weight_fp_activation_kd.yaml new file mode 100644 index 0000000..f66031e --- /dev/null +++ b/examples/cifar100/cifar100_ls1_weight_fp_activation_kd.yaml @@ -0,0 +1,89 @@ +# Validation set evaluation metrics: +# Top-1 Accuracy: 76.2% +# Top-5 Accuracy: 93.7% +seed: null +environment: + platform: local + ngpus: 1 + cuda: + cudnn_deterministic: false + cudnn_benchmark: true +data: + dataset_path: data/cifar100/ + train_batch_size: 128 + test_batch_size: 100 + workers: 16 +model: + architecture: resnet + loss: cross_entropy + arch_config: + moving_average_mode: 'off' + moving_average_momentum: 0.99 + block: xnor + layer0: + n_in_channels: 64 + kernel_size: 3 + stride: 1 + padding: 1 + bias: false + maxpool: + type: identity + layer1: + x_quant: fp + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + layer2: + x_quant: fp + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + layer3: + x_quant: fp + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + layer4: + x_quant: fp + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + nonlins: ['relu', 'relu'] + num_blocks: [2, 2, 2, 2] + output_classes: 100 + kd_config: + teacher_config_path: experiments/cifar100-teacher/config.yaml + teacher_checkpoint_path: 
experiments/cifar100-teacher/checkpoints/checkpoint_200.pt + freeze_teacher: true + train_mode: true + criterion_config: + temperature: 5 + teacher_correction: true +optimization: + epochs: 350 + optimizer: + algorithm: adam + lr: 0.0002 + weight_decay: 0.0 + lr_scheduler: + scheduler: multi_step_lr + milestones: + - 150 + - 250 + - 320 + gamma: 0.1 +log: + level: INFO + interval: 100 + tensorboard: true + tensorboard_root: runs/ + root_experiments_dir: experiments/ + save_model_freq: 80 diff --git a/examples/cifar100/cifar100_ls1_weight_gf2_activation_kd.yaml b/examples/cifar100/cifar100_ls1_weight_gf2_activation_kd.yaml new file mode 100644 index 0000000..137408d --- /dev/null +++ b/examples/cifar100/cifar100_ls1_weight_gf2_activation_kd.yaml @@ -0,0 +1,89 @@ +# Validation set evaluation metrics: +# Top-1 Accuracy: 74.3% +# Top-5 Accuracy: 93.1% +seed: null +environment: + platform: local + ngpus: 1 + cuda: + cudnn_deterministic: false + cudnn_benchmark: true +data: + dataset_path: data/cifar100/ + train_batch_size: 128 + test_batch_size: 100 + workers: 16 +model: + architecture: resnet + loss: cross_entropy + arch_config: + moving_average_mode: 'off' + moving_average_momentum: 0.99 + block: xnor + layer0: + n_in_channels: 64 + kernel_size: 3 + stride: 1 + padding: 1 + bias: false + maxpool: + type: identity + layer1: + x_quant: gf-2 + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + layer2: + x_quant: gf-2 + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + layer3: + x_quant: gf-2 + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + layer4: + x_quant: gf-2 + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + nonlins: ['relu', 'relu'] + num_blocks: [2, 2, 2, 2] + output_classes: 100 + kd_config: + teacher_config_path: experiments/cifar100-teacher/config.yaml + teacher_checkpoint_path: experiments/cifar100-teacher/checkpoints/checkpoint_200.pt + 
freeze_teacher: true + train_mode: true + criterion_config: + temperature: 5 + teacher_correction: true +optimization: + epochs: 350 + optimizer: + algorithm: adam + lr: 0.0002 + weight_decay: 0.0 + lr_scheduler: + scheduler: multi_step_lr + milestones: + - 150 + - 250 + - 320 + gamma: 0.1 +log: + level: INFO + interval: 100 + tensorboard: true + tensorboard_root: runs/ + root_experiments_dir: experiments/ + save_model_freq: 80 diff --git a/examples/cifar100/cifar100_ls1_weight_ls2_activation_kd.yaml b/examples/cifar100/cifar100_ls1_weight_ls2_activation_kd.yaml new file mode 100644 index 0000000..ce12afe --- /dev/null +++ b/examples/cifar100/cifar100_ls1_weight_ls2_activation_kd.yaml @@ -0,0 +1,89 @@ +# Validation set evaluation metrics: +# Top-1 Accuracy: 74.4% +# Top-5 Accuracy: 92.9% +seed: null +environment: + platform: local + ngpus: 1 + cuda: + cudnn_deterministic: false + cudnn_benchmark: true +data: + dataset_path: data/cifar100/ + train_batch_size: 128 + test_batch_size: 100 + workers: 16 +model: + architecture: resnet + loss: cross_entropy + arch_config: + moving_average_mode: 'off' + moving_average_momentum: 0.99 + block: xnor + layer0: + n_in_channels: 64 + kernel_size: 3 + stride: 1 + padding: 1 + bias: false + maxpool: + type: identity + layer1: + x_quant: ls-2 + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + layer2: + x_quant: ls-2 + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + layer3: + x_quant: ls-2 + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + layer4: + x_quant: ls-2 + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + nonlins: ['relu', 'relu'] + num_blocks: [2, 2, 2, 2] + output_classes: 100 + kd_config: + teacher_config_path: experiments/cifar100-teacher/config.yaml + teacher_checkpoint_path: experiments/cifar100-teacher/checkpoints/checkpoint_200.pt + freeze_teacher: true + train_mode: true + criterion_config: + 
temperature: 5 + teacher_correction: true +optimization: + epochs: 350 + optimizer: + algorithm: adam + lr: 0.0002 + weight_decay: 0.0 + lr_scheduler: + scheduler: multi_step_lr + milestones: + - 150 + - 250 + - 320 + gamma: 0.1 +log: + level: INFO + interval: 100 + tensorboard: true + tensorboard_root: runs/ + root_experiments_dir: experiments/ + save_model_freq: 80 diff --git a/examples/cifar100/cifar100_ls1_weight_lsT_activation_kd.yaml b/examples/cifar100/cifar100_ls1_weight_lsT_activation_kd.yaml new file mode 100644 index 0000000..af498ab --- /dev/null +++ b/examples/cifar100/cifar100_ls1_weight_lsT_activation_kd.yaml @@ -0,0 +1,89 @@ +# Validation set evaluation metrics: +# Top-1 Accuracy: 73.5% +# Top-5 Accuracy: 92.8% +seed: null +environment: + platform: local + ngpus: 1 + cuda: + cudnn_deterministic: false + cudnn_benchmark: true +data: + dataset_path: data/cifar100/ + train_batch_size: 128 + test_batch_size: 100 + workers: 16 +model: + architecture: resnet + loss: cross_entropy + arch_config: + moving_average_mode: 'off' + moving_average_momentum: 0.99 + block: xnor + layer0: + n_in_channels: 64 + kernel_size: 3 + stride: 1 + padding: 1 + bias: false + maxpool: + type: identity + layer1: + x_quant: ls-T + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + layer2: + x_quant: ls-T + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + layer3: + x_quant: ls-T + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + layer4: + x_quant: ls-T + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + nonlins: ['relu', 'relu'] + num_blocks: [2, 2, 2, 2] + output_classes: 100 + kd_config: + teacher_config_path: experiments/cifar100-teacher/config.yaml + teacher_checkpoint_path: experiments/cifar100-teacher/checkpoints/checkpoint_200.pt + freeze_teacher: true + train_mode: true + criterion_config: + temperature: 5 + teacher_correction: true +optimization: + epochs: 350 
+ optimizer: + algorithm: adam + lr: 0.0002 + weight_decay: 0.0 + lr_scheduler: + scheduler: multi_step_lr + milestones: + - 150 + - 250 + - 320 + gamma: 0.1 +log: + level: INFO + interval: 100 + tensorboard: true + tensorboard_root: runs/ + root_experiments_dir: experiments/ + save_model_freq: 80 diff --git a/examples/imagenet/imagenet.py b/examples/imagenet/imagenet.py new file mode 100644 index 0000000..13c4f2e --- /dev/null +++ b/examples/imagenet/imagenet.py @@ -0,0 +1,24 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# + +"""Driver script for running ImageNet.""" + +from quant.common.compute_platform import LocalComputePlatform +from quant.common.experiment import Experiment +from quant.common.parser import get_base_argument_parser, parse_config +from quant.common.tasks import classification_task +from quant.data.data_loaders import ImageNetDataLoader +from quant.utils.visualization import get_tensorboard_hooks + + +if __name__ == '__main__': + parser = get_base_argument_parser('Driver script for running ImageNet.') + args = parser.parse_args() + config = parse_config(args) + platform = LocalComputePlatform(config['log'].get('root_experiments_dir', '.')) + experiment = Experiment( + classification_task, config, ImageNetDataLoader, get_tensorboard_hooks + ) + platform.run(experiment) diff --git a/examples/imagenet/imagenet_fp.yaml b/examples/imagenet/imagenet_fp.yaml new file mode 100644 index 0000000..5ca76a8 --- /dev/null +++ b/examples/imagenet/imagenet_fp.yaml @@ -0,0 +1,75 @@ +# Validation set evaluation metrics: +# Top-1 Accuracy: 69.8% +# Top-5 Accuracy: 89.3% +seed: null +environment: + platform: local + ngpus: 8 + cuda: + cudnn_deterministic: false + cudnn_benchmark: true +data: + dataset_path: data/imagenet/ + train_batch_size: 256 + test_batch_size: 256 + workers: 16 +model: + architecture: resnet + loss: cross_entropy # select from {'cross_entropy', 'nll_loss', 'kl_div'}, see 
get_loss_fn() docs for details. + arch_config: + moving_average_mode: 'off' # select from {'off', 'eval_only', 'train_and_eval'}, see ActivationQuantizer docs for details. + moving_average_momentum: 0.99 + block: regular # select from {'regular', 'xnor'}, see QResNet docs for details. + layer0: + n_in_channels: 64 + kernel_size: 7 + stride: 2 + padding: 3 + bias: false + maxpool: + type: maxpool2d + kernel_size: 3 + stride: 2 + padding: 1 + layer1: + x_quant: fp # select from {'fp', 'ls-1', 'ls-T', 'ls-2', 'gf-1', 'gf-2', 'gf-3' (any `gf-k`)}, see QuantConv2d docs for details. + w_quant: fp # select from {'fp', 'ls-1', 'ls-T', 'ls-2', 'gf-1', 'gf-2', 'gf-3' (any `gf-k`)}, see QuantConv2d docs for details. + clamp: + kind: identity # select from {'identity', 'symmetric'}, see QuantConv2d docs for details. + layer2: + x_quant: fp + w_quant: fp + clamp: + kind: identity + layer3: + x_quant: fp + w_quant: fp + clamp: + kind: identity + layer4: + x_quant: fp + w_quant: fp + clamp: + kind: identity + nonlins: ['relu', 'relu'] # A list of 2 strings where each string is in {'relu', 'prelu', 'identity'}. + num_blocks: [2, 2, 2, 2] + output_classes: 1000 +optimization: + epochs: 100 + optimizer: + algorithm: sgd # select from {'sgd', 'adam', 'adadelta'}, see get_optimizer() docs for details. + lr: 0.1 + momentum: 0.9 + nesterov: true + weight_decay: 0.0001 + lr_scheduler: + scheduler: step_lr # select from {'step_lr', 'multi_step_lr', 'linear_lr', 'lambda_lr'}, see get_lr_scheduler() docs for details. 
+ gamma: 0.1 + step_size: 30 +log: + level: INFO + interval: 80 + tensorboard: true + tensorboard_root: runs/ + root_experiments_dir: experiments/ + save_model_freq: 20 diff --git a/examples/imagenet/imagenet_ls1_kd.yaml b/examples/imagenet/imagenet_ls1_kd.yaml new file mode 100644 index 0000000..1554dd5 --- /dev/null +++ b/examples/imagenet/imagenet_ls1_kd.yaml @@ -0,0 +1,88 @@ +# Validation set evaluation metrics: +# Top-1 Accuracy: 58.9% +# Top-5 Accuracy: 81.4% +seed: null +environment: + platform: local + ngpus: 8 + cuda: + cudnn_deterministic: false + cudnn_benchmark: true +data: + dataset_path: data/imagenet/ + train_batch_size: 256 + test_batch_size: 256 + workers: 16 +model: + architecture: resnet + loss: cross_entropy + arch_config: + moving_average_mode: 'off' + moving_average_momentum: 0.99 + block: xnor + layer0: + n_in_channels: 64 + kernel_size: 7 + stride: 2 + padding: 3 + bias: false + maxpool: + type: maxpool2d + kernel_size: 3 + stride: 2 + padding: 1 + layer1: + x_quant: ls-1 + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + layer2: + x_quant: ls-1 + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + layer3: + x_quant: ls-1 + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + layer4: + x_quant: ls-1 + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + nonlins: ['prelu', 'prelu'] + num_blocks: [2, 2, 2, 2] + output_classes: 1000 + kd_config: + teacher_config_path: experiments/imagenet-teacher/config.yaml + teacher_checkpoint_path: experiments/imagenet-teacher/checkpoints/checkpoint_100.pt + freeze_teacher: true + train_mode: true + criterion_config: + temperature: 1 + teacher_correction: false +optimization: + epochs: 240 + optimizer: + algorithm: adam + lr: 0.0002 + weight_decay: 0 + lr_scheduler: + scheduler: linear_lr + min_lr: 2e-7 +log: + level: INFO + interval: 100 + tensorboard: true + tensorboard_root: runs/ + 
root_experiments_dir: experiments/ + save_model_freq: 20 diff --git a/examples/imagenet/imagenet_ls1_weight_fp_activation_kd.yaml b/examples/imagenet/imagenet_ls1_weight_fp_activation_kd.yaml new file mode 100644 index 0000000..00e499f --- /dev/null +++ b/examples/imagenet/imagenet_ls1_weight_fp_activation_kd.yaml @@ -0,0 +1,88 @@ +# Validation set evaluation metrics: +# Top-1 Accuracy: 66.1% +# Top-5 Accuracy: 86.5% +seed: null +environment: + platform: local + ngpus: 8 + cuda: + cudnn_deterministic: false + cudnn_benchmark: true +data: + dataset_path: data/imagenet/ + train_batch_size: 256 + test_batch_size: 256 + workers: 16 +model: + architecture: resnet + loss: cross_entropy + arch_config: + moving_average_mode: 'off' + moving_average_momentum: 0.99 + block: xnor + layer0: + n_in_channels: 64 + kernel_size: 7 + stride: 2 + padding: 3 + bias: false + maxpool: + type: maxpool2d + kernel_size: 3 + stride: 2 + padding: 1 + layer1: + x_quant: fp + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + layer2: + x_quant: fp + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + layer3: + x_quant: fp + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + layer4: + x_quant: fp + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + nonlins: ['prelu', 'prelu'] + num_blocks: [2, 2, 2, 2] + output_classes: 1000 + kd_config: + teacher_config_path: experiments/imagenet-teacher/config.yaml + teacher_checkpoint_path: experiments/imagenet-teacher/checkpoints/checkpoint_100.pt + freeze_teacher: true + train_mode: true + criterion_config: + temperature: 1 + teacher_correction: false +optimization: + epochs: 240 + optimizer: + algorithm: adam + lr: 0.0002 + weight_decay: 0 + lr_scheduler: + scheduler: linear_lr + min_lr: 2e-7 +log: + level: INFO + interval: 100 + tensorboard: true + tensorboard_root: runs/ + root_experiments_dir: experiments/ + save_model_freq: 20 diff --git 
a/examples/imagenet/imagenet_ls1_weight_gf2_activation_kd.yaml b/examples/imagenet/imagenet_ls1_weight_gf2_activation_kd.yaml new file mode 100644 index 0000000..b89a3de --- /dev/null +++ b/examples/imagenet/imagenet_ls1_weight_gf2_activation_kd.yaml @@ -0,0 +1,88 @@ +# Validation set evaluation metrics: +# Top-1 Accuracy: 62.6% +# Top-5 Accuracy: 84.0% +seed: null +environment: + platform: local + ngpus: 8 + cuda: + cudnn_deterministic: false + cudnn_benchmark: true +data: + dataset_path: data/imagenet/ + train_batch_size: 256 + test_batch_size: 256 + workers: 16 +model: + architecture: resnet + loss: cross_entropy + arch_config: + moving_average_mode: 'off' + moving_average_momentum: 0.99 + block: xnor + layer0: + n_in_channels: 64 + kernel_size: 7 + stride: 2 + padding: 3 + bias: false + maxpool: + type: maxpool2d + kernel_size: 3 + stride: 2 + padding: 1 + layer1: + x_quant: gf-2 + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 3 + double_shortcut: true + layer2: + x_quant: gf-2 + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 3 + double_shortcut: true + layer3: + x_quant: gf-2 + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 3 + double_shortcut: true + layer4: + x_quant: gf-2 + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 3 + double_shortcut: true + nonlins: ['prelu', 'prelu'] + num_blocks: [2, 2, 2, 2] + output_classes: 1000 + kd_config: + teacher_config_path: experiments/imagenet-teacher/config.yaml + teacher_checkpoint_path: experiments/imagenet-teacher/checkpoints/checkpoint_100.pt + freeze_teacher: true + train_mode: true + criterion_config: + temperature: 1 + teacher_correction: false +optimization: + epochs: 240 + optimizer: + algorithm: adam + lr: 0.0002 + weight_decay: 0 + lr_scheduler: + scheduler: linear_lr + min_lr: 2e-7 +log: + level: INFO + interval: 100 + tensorboard: true + tensorboard_root: runs/ + root_experiments_dir: experiments/ + save_model_freq: 40 diff --git a/examples/imagenet/imagenet_ls1_weight_ls2_activation_kd.yaml 
b/examples/imagenet/imagenet_ls1_weight_ls2_activation_kd.yaml new file mode 100644 index 0000000..bd8c3f6 --- /dev/null +++ b/examples/imagenet/imagenet_ls1_weight_ls2_activation_kd.yaml @@ -0,0 +1,88 @@ +# Validation set evaluation metrics: +# Top-1 Accuracy: 63.4% +# Top-5 Accuracy: 84.6% +seed: null +environment: + platform: local + ngpus: 8 + cuda: + cudnn_deterministic: false + cudnn_benchmark: true +data: + dataset_path: data/imagenet/ + train_batch_size: 256 + test_batch_size: 256 + workers: 16 +model: + architecture: resnet + loss: cross_entropy + arch_config: + moving_average_mode: 'off' + moving_average_momentum: 0.99 + block: xnor + layer0: + n_in_channels: 64 + kernel_size: 7 + stride: 2 + padding: 3 + bias: false + maxpool: + type: maxpool2d + kernel_size: 3 + stride: 2 + padding: 1 + layer1: + x_quant: ls-2 + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 3 + double_shortcut: true + layer2: + x_quant: ls-2 + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 3 + double_shortcut: true + layer3: + x_quant: ls-2 + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 3 + double_shortcut: true + layer4: + x_quant: ls-2 + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 3 + double_shortcut: true + nonlins: ['relu', 'relu'] + num_blocks: [2, 2, 2, 2] + output_classes: 1000 + kd_config: + teacher_config_path: experiments/imagenet-teacher/config.yaml + teacher_checkpoint_path: experiments/imagenet-teacher/checkpoints/checkpoint_100.pt + freeze_teacher: true + train_mode: true + criterion_config: + temperature: 1 + teacher_correction: false +optimization: + epochs: 240 + optimizer: + algorithm: adam + lr: 0.0002 + weight_decay: 0 + lr_scheduler: + scheduler: linear_lr + min_lr: 2e-7 +log: + level: INFO + interval: 100 + tensorboard: true + tensorboard_root: runs/ + root_experiments_dir: experiments/ + save_model_freq: 20 diff --git a/examples/imagenet/imagenet_ls1_weight_lsT_activation_kd.yaml b/examples/imagenet/imagenet_ls1_weight_lsT_activation_kd.yaml new 
file mode 100644 index 0000000..6f864f4 --- /dev/null +++ b/examples/imagenet/imagenet_ls1_weight_lsT_activation_kd.yaml @@ -0,0 +1,88 @@ +# Validation set evaluation metrics: +# Top-1 Accuracy: 62.0% +# Top-5 Accuracy: 83.6% +seed: null +environment: + platform: local + ngpus: 8 + cuda: + cudnn_deterministic: false + cudnn_benchmark: true +data: + dataset_path: data/imagenet/ + train_batch_size: 256 + test_batch_size: 256 + workers: 16 +model: + architecture: resnet + loss: cross_entropy + arch_config: + moving_average_mode: 'off' + moving_average_momentum: 0.99 + block: xnor + layer0: + n_in_channels: 64 + kernel_size: 7 + stride: 2 + padding: 3 + bias: false + maxpool: + type: maxpool2d + kernel_size: 3 + stride: 2 + padding: 1 + layer1: + x_quant: ls-T + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + layer2: + x_quant: ls-T + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + layer3: + x_quant: ls-T + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + layer4: + x_quant: ls-T + w_quant: ls-1 + clamp: + kind: symmetric + alpha: 2 + double_shortcut: true + nonlins: ['prelu', 'prelu'] + num_blocks: [2, 2, 2, 2] + output_classes: 1000 + kd_config: + teacher_config_path: experiments/imagenet-teacher/config.yaml + teacher_checkpoint_path: experiments/imagenet-teacher/checkpoints/checkpoint_100.pt + freeze_teacher: true + train_mode: true + criterion_config: + temperature: 1 + teacher_correction: false +optimization: + epochs: 240 + optimizer: + algorithm: adam + lr: 0.0002 + weight_decay: 0 + lr_scheduler: + scheduler: linear_lr + min_lr: 2e-7 +log: + level: INFO + interval: 100 + tensorboard: true + tensorboard_root: runs/ + root_experiments_dir: experiments/ + save_model_freq: 40 diff --git a/examples/mnist/mnist.py b/examples/mnist/mnist.py new file mode 100644 index 0000000..eac1bd8 --- /dev/null +++ b/examples/mnist/mnist.py @@ -0,0 +1,22 @@ +# +# For licensing see accompanying 
LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# + +"""Driver script for running MNIST.""" + +from quant.common.compute_platform import LocalComputePlatform +from quant.common.experiment import Experiment +from quant.common.parser import get_base_argument_parser, parse_config +from quant.common.tasks import classification_task +from quant.data.data_loaders import MNISTDataLoader +from quant.utils.visualization import get_tensorboard_hooks + + +if __name__ == '__main__': + parser = get_base_argument_parser('Driver script for running MNIST.') + args = parser.parse_args() + config = parse_config(args) + platform = LocalComputePlatform(config['log'].get('root_experiments_dir', '.')) + experiment = Experiment(classification_task, config, MNISTDataLoader, get_tensorboard_hooks) + platform.run(experiment) diff --git a/examples/mnist/mnist_fp.yaml b/examples/mnist/mnist_fp.yaml new file mode 100644 index 0000000..dfd25c5 --- /dev/null +++ b/examples/mnist/mnist_fp.yaml @@ -0,0 +1,44 @@ +# Validation set evaluation metrics: +# Top-1 Accuracy: 99.4% +# Top-5 Accuracy: 100.0% +seed: null +environment: + platform: local + cuda: + cudnn_deterministic: true + cudnn_benchmark: false +data: + dataset_path: data/mnist/ + download: true + train_batch_size: 64 + test_batch_size: 5000 + workers: 4 +model: + architecture: lenet5 + loss: nll_loss # select from {'cross_entropy', 'nll_loss', 'kl_div'}, see get_loss_fn() docs for details. + arch_config: + moving_average_mode: 'off' # select from {'off', 'eval_only', 'train_and_eval'}, see ActivationQuantizer docs for details. + moving_average_momentum: 0.99 + x_quant: fp # select from {'fp', 'ls-1', 'ls-T', 'ls-2', 'gf-1', 'gf-2', 'gf-3' (any `gf-k`)}, see QuantConv2d docs for details. + w_quant: fp # select from {'fp', 'ls-1', 'ls-T', 'ls-2', 'gf-1', 'gf-2', 'gf-3' (any `gf-k`)}, see QuantConv2d docs for details. + clamp: + kind: identity # select from {'identity', 'symmetric'}, see QuantConv2d docs for details. 
+ conv1_filters: 20 + conv2_filters: 50 + output_classes: 10 +optimization: + epochs: 10 + optimizer: + algorithm: adadelta # select from {'sgd', 'adam', 'adadelta'}, see get_optimizer() docs for details. + lr: 1.0 + lr_scheduler: + scheduler: step_lr # select from {'step_lr', 'multi_step_lr', 'linear_lr', 'lambda_lr'}, see get_lr_scheduler() docs for details. + step_size: 1 + gamma: 0.7 +log: + level: INFO + interval: 10 + tensorboard: true + tensorboard_root: runs/ + root_experiments_dir: experiments/ + save_model_freq: 2 diff --git a/examples/mnist/mnist_ls1.yaml b/examples/mnist/mnist_ls1.yaml new file mode 100644 index 0000000..203ea0c --- /dev/null +++ b/examples/mnist/mnist_ls1.yaml @@ -0,0 +1,43 @@ +# Validation set evaluation metrics: +# Top-1 Accuracy: 99.2% +# Top-5 Accuracy: 100.0% +seed: null +environment: + platform: local + cuda: + cudnn_deterministic: true + cudnn_benchmark: false +data: + dataset_path: data/mnist/ + train_batch_size: 64 + test_batch_size: 5000 + workers: 4 +model: + architecture: lenet5 + loss: nll_loss + arch_config: + moving_average_mode: 'off' + moving_average_momentum: 0.99 + x_quant: ls-1 + w_quant: ls-1 + clamp: + kind: identity + conv1_filters: 20 + conv2_filters: 50 + output_classes: 10 +optimization: + epochs: 10 + optimizer: + algorithm: adadelta + lr: 1.0 + lr_scheduler: + scheduler: step_lr + step_size: 1 + gamma: 0.7 +log: + level: INFO + interval: 10 + tensorboard: true + tensorboard_root: runs/ + root_experiments_dir: experiments/ + save_model_freq: 2 diff --git a/examples/mnist/mnist_ls1_weight_fp_activation.yaml b/examples/mnist/mnist_ls1_weight_fp_activation.yaml new file mode 100644 index 0000000..07df78e --- /dev/null +++ b/examples/mnist/mnist_ls1_weight_fp_activation.yaml @@ -0,0 +1,43 @@ +# Validation set evaluation metrics: +# Top-1 Accuracy: 99.4% +# Top-5 Accuracy: 100.0% +seed: null +environment: + platform: local + cuda: + cudnn_deterministic: true + cudnn_benchmark: false +data: + dataset_path: 
data/mnist/ + train_batch_size: 64 + test_batch_size: 5000 + workers: 4 +model: + architecture: lenet5 + loss: nll_loss + arch_config: + moving_average_mode: 'off' + moving_average_momentum: 0.99 + x_quant: fp + w_quant: ls-1 + clamp: + kind: identity + conv1_filters: 20 + conv2_filters: 50 + output_classes: 10 +optimization: + epochs: 10 + optimizer: + algorithm: adadelta + lr: 1.0 + lr_scheduler: + scheduler: step_lr + step_size: 1 + gamma: 0.7 +log: + level: INFO + interval: 10 + tensorboard: true + tensorboard_root: runs/ + root_experiments_dir: experiments/ + save_model_freq: 2 diff --git a/examples/mnist/mnist_ls1_weight_gf2_activation.yaml b/examples/mnist/mnist_ls1_weight_gf2_activation.yaml new file mode 100644 index 0000000..3556979 --- /dev/null +++ b/examples/mnist/mnist_ls1_weight_gf2_activation.yaml @@ -0,0 +1,43 @@ +# Validation set evaluation metrics: +# Top-1 Accuracy: 99.2% +# Top-5 Accuracy: 100.0% +seed: null +environment: + platform: local + cuda: + cudnn_deterministic: true + cudnn_benchmark: false +data: + dataset_path: data/mnist/ + train_batch_size: 64 + test_batch_size: 5000 + workers: 4 +model: + architecture: lenet5 + loss: nll_loss + arch_config: + moving_average_mode: 'off' + moving_average_momentum: 0.99 + x_quant: gf-2 + w_quant: ls-1 + clamp: + kind: identity + conv1_filters: 20 + conv2_filters: 50 + output_classes: 10 +optimization: + epochs: 10 + optimizer: + algorithm: adadelta + lr: 1.0 + lr_scheduler: + scheduler: step_lr + step_size: 1 + gamma: 0.7 +log: + level: INFO + interval: 10 + tensorboard: true + tensorboard_root: runs/ + root_experiments_dir: experiments/ + save_model_freq: 2 diff --git a/examples/mnist/mnist_ls1_weight_ls2_activation.yaml b/examples/mnist/mnist_ls1_weight_ls2_activation.yaml new file mode 100644 index 0000000..28d5e65 --- /dev/null +++ b/examples/mnist/mnist_ls1_weight_ls2_activation.yaml @@ -0,0 +1,43 @@ +# Validation set evaluation metrics: +# Top-1 Accuracy: 99.3% +# Top-5 Accuracy: 100.0% +seed: 
null +environment: + platform: local + cuda: + cudnn_deterministic: true + cudnn_benchmark: false +data: + dataset_path: data/mnist/ + train_batch_size: 64 + test_batch_size: 5000 + workers: 4 +model: + architecture: lenet5 + loss: nll_loss + arch_config: + moving_average_mode: 'off' + moving_average_momentum: 0.99 + x_quant: ls-2 + w_quant: ls-1 + clamp: + kind: identity + conv1_filters: 20 + conv2_filters: 50 + output_classes: 10 +optimization: + epochs: 10 + optimizer: + algorithm: adadelta + lr: 1.0 + lr_scheduler: + scheduler: step_lr + step_size: 1 + gamma: 0.7 +log: + level: INFO + interval: 10 + tensorboard: true + tensorboard_root: runs/ + root_experiments_dir: experiments/ + save_model_freq: 2 diff --git a/examples/mnist/mnist_ls1_weight_lsT_activation.yaml b/examples/mnist/mnist_ls1_weight_lsT_activation.yaml new file mode 100644 index 0000000..c3b1ca9 --- /dev/null +++ b/examples/mnist/mnist_ls1_weight_lsT_activation.yaml @@ -0,0 +1,43 @@ +# Validation set evaluation metrics: +# Top-1 Accuracy: 99.2% +# Top-5 Accuracy: 100.0% +seed: null +environment: + platform: local + cuda: + cudnn_deterministic: true + cudnn_benchmark: false +data: + dataset_path: data/mnist/ + train_batch_size: 64 + test_batch_size: 5000 + workers: 4 +model: + architecture: lenet5 + loss: nll_loss + arch_config: + moving_average_mode: 'off' + moving_average_momentum: 0.99 + x_quant: ls-T + w_quant: ls-1 + clamp: + kind: identity + conv1_filters: 20 + conv2_filters: 50 + output_classes: 10 +optimization: + epochs: 10 + optimizer: + algorithm: adadelta + lr: 1.0 + lr_scheduler: + scheduler: step_lr + step_size: 1 + gamma: 0.7 +log: + level: INFO + interval: 10 + tensorboard: true + tensorboard_root: runs/ + root_experiments_dir: experiments/ + save_model_freq: 2 diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 0000000..e6c9fa7 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,32 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. 
All Rights Reserved. +# + +[mypy] + +# You may need to copy, paste and uncomment the following snippet for libraries that do not +# support mypy yet. + +#[mypy-.*] +#ignore_missing_imports = True + +[mypy-pytest.*] +ignore_missing_imports = True + +[mypy-pandas.*] +ignore_missing_imports = True + +[mypy-torchvision.*] +ignore_missing_imports = True + +# These are the settings for your own code +[mypy-quant.*] +# Disallow calls from functions with type annotation to functions with no type annotations +disallow_untyped_calls = True +# Disallow defs with no or incomplete type annotations +disallow_untyped_defs = True +# Type-check inside functions with no type annotations +check_untyped_defs = True +# Warns about unneeded ignore comments +warn_unused_ignores = True diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..f63acf5 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,40 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# + +# Tell pip to use flit to build this package +[build-system] +requires = ["flit"] +build-backend = "flit.buildapi" +
+[tool.flit.metadata] +module = "quant" +author = "Hadi Pouransari, Zhucheng Tu" +author-email = "mpouransari@apple.com, zhucheng_tu@apple.com" + +license = "Apple Sample Code License" +requires-python = ">=3.6,<8" +description-file="README.md" + +# List here all your dependencies +requires = [ +] + +[tool.flit.metadata.requires-extra] +# Packages required for testing +test = [ + "pytest", + "pytest-mypy", + "pytest-flake8", + "pytest-cov", + "flake8-docstrings", + "flake8-copyright", +] +# Packages required to build the documentation +doc = [ + "sphinx", + "sphinx-rtd-theme", + "sphinx-autodoc-typehints", + "m2r" +] diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..3643673 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,28 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved.
+# + +[pytest] +# This determines where tests are found +testpaths = quant/ tests/ + +# Run pytest with these options by default +# Enables: mypy, flake8, and coverage.py +addopts = + --mypy + --flake8 + --cov quant + --cov-config coverage.ini + --cov-report term + +# flake8 configuration options (plugin does not yet allow us to place it in its own file) +flake8-extensions = .py +flake8-ignore = + __init__.py F401 # Ignore unused imports in __init__.py's + tests/*.py D1 # Ignore documentation issues in tests + D107 # Ignore lack of documentation in __init__ magic methods +flake8-max-line-length = 100 + +markers = + incremental: mark an incremental test, a test that performs a sequence of steps and stops when any step fails. diff --git a/quant/__init__.py b/quant/__init__.py new file mode 100644 index 0000000..1db0011 --- /dev/null +++ b/quant/__init__.py @@ -0,0 +1,29 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# + +"""A toolkit supporting binary quantization of neural networks.""" + +from typing import Dict, Optional +from typing_extensions import Protocol + +from quant.common.metrics import Metric + +__version__ = '0.2.0' + + +# Define some common types here + +MetricDict = Dict[str, Metric] + + +class Hook(Protocol): + """Hook protocol.""" + + def __call__( + self, epoch: int, global_step: int, + log_interval: int = 10, values_dict: Optional[dict] = None + ) -> None: + """Define function signature for a hook.""" + ... diff --git a/quant/binary/__init__.py b/quant/binary/__init__.py new file mode 100644 index 0000000..4517dbe --- /dev/null +++ b/quant/binary/__init__.py @@ -0,0 +1,6 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved.
+# + +"""Package containing code for binary quantization.""" diff --git a/quant/binary/activation_quantization.py b/quant/binary/activation_quantization.py new file mode 100644 index 0000000..068ccf6 --- /dev/null +++ b/quant/binary/activation_quantization.py @@ -0,0 +1,239 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# + +"""Activation quantization.""" + +from abc import abstractmethod +from enum import Enum +from typing import List, Tuple + +import torch +import torch.nn as nn + +import quant.binary.quantization as quantization +from quant.utils.moving_average import MovingAverage + + +class MovingAverageMode(Enum): + """ + Mode for moving average. + + See :class:`~quant.binary.activation_quantization.ActivationQuantizer`. + """ + + off = 'off' + eval_only = 'eval_only' + train_and_eval = 'train_and_eval' + + +class ActivationQuantizer(nn.Module): + """ + Activation quantizer abstract class. + + The moving average mode can have 3 options: 'off', 'eval_only', or 'train_and_eval'. + + When moving_average_mode is 'off', moving average is not used. + + When moving_average_mode is 'eval_only', the moving average is tracked but not used + during training and only used during evaluation mode. + + When moving_average_mode is 'train_and_eval' the moving average is tracked and applied + during training and used during evaluation as well. + + Currently, 'train_and_eval' can only be used with a single GPU + and does not support ``nn.DataParallel``. + + The momentum is a value in [0, 1] used in exponential moving average update. 
+ If the momentum is `alpha`, the update function is: + `alpha * x + (1 - alpha) * x_new` + """ + + def __init__( + self, + num_scaling_factors: int, + moving_average_mode: str = 'off', + moving_average_momentum: float = 0.99, + ) -> None: + """Construct an activation quantizer.""" + super(ActivationQuantizer, self).__init__() + + momentum_vec = [moving_average_momentum] * num_scaling_factors + + self.num_scaling_factors = num_scaling_factors + self.moving_avg_module = MovingAverage(torch.tensor(momentum_vec)) + self.moving_average_mode = MovingAverageMode(moving_average_mode) + + def forward(self, x: torch.Tensor) -> torch.Tensor: # type: ignore + """Forward pass of quantizing activation.""" + if self.training: + # batch_vs is a 2D tensor that stores each v_i along each row + batch_vs, x_q = self._batch_quantization(x) + + if self.moving_average_mode != MovingAverageMode.off: + vs_batch_avg = batch_vs.mean(1) + # Calling moving_avg_module will update its internal statistics under the hood. + # This is similar to the forward pass of batch norm. + moving_avg_vs = self.moving_avg_module(vs_batch_avg) + + if self.moving_average_mode == MovingAverageMode.train_and_eval: + # If we want to use the scalars with moving average, we need to expand + # every scaling factor tensor to the batch size from a single mean element. + vs = [ + moving_avg_vs[i].expand(x.shape[0]) + for i in range(self.num_scaling_factors) + ] + + x_q = self._moving_average_quantization(x, vs) + else: + if self.moving_average_mode != MovingAverageMode.off: + # If we want to use the scalars with moving average, we need to expand + # every scaling factor tensor to the batch size from a single mean element. 
+ vs = [ + self.moving_avg_module.moving_average[i].expand(x.shape[0]) # type: ignore + for i in range(self.moving_avg_module.moving_average.size(0)) # type: ignore + ] + + x_q = self._moving_average_quantization(x, vs) + else: + batch_vs, x_q = self._batch_quantization(x) + + return x_q + + @abstractmethod + def _batch_quantization(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """Return a 2-tuple of (scaling factors, quantized x).""" + raise NotImplementedError # pragma: no cover + + @abstractmethod + def _moving_average_quantization( + self, x: torch.Tensor, vs: List[torch.Tensor] + ) -> torch.Tensor: + """Return quantized x using vs.""" + raise NotImplementedError # pragma: no cover + + +class ActivationQuantizerLS1(ActivationQuantizer): + """Activation quantizer using least squares, 1 bit.""" + + def __init__( + self, + moving_average_mode: str = 'off', + moving_average_momentum: float = 0.99, + ) -> None: + """Construct an activation quantizer using least squares with 1 bit.""" + super(ActivationQuantizerLS1, self).__init__( + 1, moving_average_mode, moving_average_momentum + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: # type: ignore + """Forward pass of quantizing activation using least squares 1 bit.""" + return super(ActivationQuantizerLS1, self).forward(x) + + def _batch_quantization(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """Return a 2-tuple of (scaling factors, quantized x).""" + batch_v1, x_q = quantization.quantizer_ls_1(x) + return batch_v1.view(1, -1), x_q + + def _moving_average_quantization( + self, x: torch.Tensor, vs: List[torch.Tensor] + ) -> torch.Tensor: + """Return quantized x using vs.""" + v1 = vs[0] + _, x_q = quantization.quantizer_ls_1(x, v1) + return x_q + + +class ActivationQuantizerLS2(ActivationQuantizer): + """Activation quantizer using least squares, 2 bits.""" + + def __init__( + self, + moving_average_mode: str = 'off', + moving_average_momentum: float = 0.99, + ) -> None: + 
"""Construct an activation quantizer using least squares with 2 bit.""" + super(ActivationQuantizerLS2, self).__init__( + 2, moving_average_mode, moving_average_momentum + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: # type: ignore + """Forward pass of quantizing activation using least squares 2 bits.""" + return super(ActivationQuantizerLS2, self).forward(x) + + def _batch_quantization(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """Return a 2-tuple of (scaling factors, quantized x).""" + batch_v1, batch_v2, x_q = quantization.quantizer_ls_2(x) + return torch.stack([batch_v1, batch_v2]), x_q + + def _moving_average_quantization( + self, x: torch.Tensor, vs: List[torch.Tensor] + ) -> torch.Tensor: + """Return quantized x using vs.""" + v1, v2 = vs[0], vs[1] + _, _, x_q = quantization.quantizer_ls_2(x, v1, v2) + return x_q + + +class ActivationQuantizerLST(ActivationQuantizer): + """Activation quantizer using least squares, ternary.""" + + def __init__( + self, + moving_average_mode: str = 'off', + moving_average_momentum: float = 0.99, + ) -> None: + """Construct an activation quantizer using least squares, ternary.""" + super(ActivationQuantizerLST, self).__init__( + 1, moving_average_mode, moving_average_momentum + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: # type: ignore + """Forward pass of quantizing activation using least squares ternary.""" + return super(ActivationQuantizerLST, self).forward(x) + + def _batch_quantization(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """Return a 2-tuple of (scaling factors, quantized x).""" + batch_v1, x_q = quantization.quantizer_ls_ternary(x) + return batch_v1.view(1, -1), x_q + + def _moving_average_quantization( + self, x: torch.Tensor, vs: List[torch.Tensor] + ) -> torch.Tensor: + """Return quantized x using vs.""" + v1 = vs[0] + _, x_q = quantization.quantizer_ls_ternary(x, v1) + return x_q + + +class ActivationQuantizerGF(ActivationQuantizer): + 
"""Activation greedy foldable quantizer.""" + + def __init__( + self, + k: int, + moving_average_mode: str = 'off', + moving_average_momentum: float = 0.99, + ) -> None: + """Construct a greedy-foldable quantizer with `k`-bits.""" + super(ActivationQuantizerGF, self).__init__( + k, moving_average_mode, moving_average_momentum + ) + self.k = k + + def forward(self, x: torch.Tensor) -> torch.Tensor: # type: ignore + """Forward pass of greedy foldable quantizer with `k`-bits.""" + return super(ActivationQuantizerGF, self).forward(x) + + def _batch_quantization(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """Return a 2-tuple of (scaling factors, quantized x).""" + batch_vs, x_q = quantization.quantizer_gf(x, self.k) + return torch.stack(batch_vs), x_q + + def _moving_average_quantization( + self, x: torch.Tensor, vs: List[torch.Tensor] + ) -> torch.Tensor: + """Return quantized x using vs.""" + _, x_q = quantization.quantizer_gf(x, self.k, vs) + return x_q diff --git a/quant/binary/binary_conv.py b/quant/binary/binary_conv.py new file mode 100644 index 0000000..5ec1ec6 --- /dev/null +++ b/quant/binary/binary_conv.py @@ -0,0 +1,173 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# + +""" +Convolution layers that support different scaled binary quantization algorithms. + +Layers support separate quantization schemes for activations and weights. +Activation quantization scheme (`x_quant`) and weight quantization scheme (`w_quant`) +can have the following options: `fp`, `ls-1`, `ls-2`, `ls-T`, `gf-k` (`gf-1`, `gf-2`, etc.). +They refer to different algorithms used for quantization. + +`fp` means use full precision (no quantization), so the behavior should be the same as regular +PyTorch ``nn.Conv2d`` assuming identity clamping. + +`ls-1` refers to the least squares 1-bit algorithm. + +`ls-2` refers to the least squares 2-bits algorithm. + +`ls-T` refers to the ternary algorithm. 
    def __init__(
        self,
        x_quant: str,
        w_quant: str,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, Tuple[int, int]],
        clamp: Optional[Dict] = None,
        moving_average_mode: str = 'off',
        moving_average_momentum: float = 0.99,
        **kwargs: Any,
    ):
        """
        Construct a QuantConv2d instance.

        Args:
            x_quant: quantization scheme for activations
            w_quant: quantization scheme for weights
            in_channels: number of input channels
            out_channels: number of output channels
            kernel_size: size of convolving kernel
            clamp: clamping scheme for activations; defaults to identity clamping
            moving_average_mode: moving average mode to use,
                see :class:`~quant.binary.activation_quantization.ActivationQuantizer`.
            moving_average_momentum: momentum for moving average update,
                see :class:`~quant.binary.activation_quantization.ActivationQuantizer`.
            kwargs: remaining keyword arguments forwarded to ``nn.Conv2d``
                (e.g. stride, padding, dilation, groups, bias)
        """
        super(QuantConv2d, self).__init__(in_channels, out_channels, kernel_size, **kwargs)

        if clamp is None:
            clamp = {'kind': 'identity'}

        # Quantizers for activations and weights; weight quantizer caches one
        # scaling factor per output channel.
        self.x_approximate = self._get_x_quantizer(
            x_quant, moving_average_mode, moving_average_momentum)
        self.w_approximate = self._get_w_quantizer(w_quant, out_channels)

        self.clamping_fn = self._get_clamper(**clamp)

        # Group this layer's parameters by quantization scheme. Bias, if any,
        # is never quantized and is recorded under 'fp'.
        self.quantized_parameters: Dict[str, List[torch.Tensor]] = defaultdict(list)
        if self.bias is not None:
            self.quantized_parameters['fp'].append(self.bias)
        self.quantized_parameters[w_quant].append(self.weight)
    def forward(self, x: torch.Tensor) -> torch.Tensor:  # type: ignore
        """
        Forward pass of this layer.

        Computes ``conv2d(w_quant(weight), x_quant(clamp(x)))``: the input is
        clamped, then quantized; the weight is quantized; and a regular 2D
        convolution is applied with this layer's stride/padding/dilation/groups.
        """
        x_q = self.x_approximate(self.clamping_fn(x))
        w_q = self.w_approximate(self.weight)
        # Bias (if present) is applied in full precision.
        return F.conv2d(
            input=x_q,
            weight=w_q,
            bias=self.bias,
            stride=self.stride,
            padding=self.padding,
            dilation=self.dilation,
            groups=self.groups,
        )
def compute_mask(matrix: torch.Tensor, ternary: bool = False) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Compute mask for a 2D tensor of absolute values.

    The mask reveals potential optimal values.

    Args:
        matrix: A 2D tensor of absolute values.
        ternary: whether we are computing mask for ternary algorithm

    Returns:
        A 2-tuple of tensors, where the first element is a mask
        tensor and the second element are values selected
    """
    # Sort each row so the cumulative sums below yield prefix/suffix means.
    values, _ = torch.sort(matrix, dim=1)
    cum_sums = values.cumsum(dim=1)

    # store counts of elements at the corresponding position
    counts = torch.arange(1, matrix.shape[1] + 1, device=matrix.device)
    counts_rev = torch.flip(counts, [0]) - 1
    counts_rev[-1] = 1  # avoid division by 0, value at this pos. will not be used

    m1s = None
    if not ternary:
        # m1s stores cumulative means from left to right (chopping left and right most values)
        m1s = (cum_sums / counts)[:, 1:-1]
    # m2s stores cumulative means from right to left (chopping left and right most values)
    m2s = ((cum_sums[:, -1:] - cum_sums) / counts_rev)[:, 1:-1]

    # re-using m1s and m2s to save memory
    # using m1s and m2s values to find potential optimal solutions to v1 and v2
    if not ternary:
        m1s = 0.5 * (m1s + m2s)
    m2s = 0.5 * m2s
    # Find potential solutions in inner region and boundary
    # Instead of finding equality, find index where m1s or m2s
    # is >= than everything on the left and <= than everything on the right
    mask = (values[:, 1:-1] <= m2s) * (m2s <= values[:, 2:])
    if not ternary:
        mask = mask + (values[:, 1:-1] <= m1s) * (m1s <= values[:, 2:])

    masked_vs = torch.masked_select(values[:, 1:-1], mask)
    return mask, masked_vs
def opt_v1(matrix: torch.Tensor, ternary: bool, skip: int = 1) -> torch.Tensor:  # type: ignore
    """
    Implement the algorithm to find v1 for least squares 2-bit and ternary algorithm.

    Args:
        matrix: A 2D tensor
        ternary: whether to do ternary optimization
        skip: increment in potential solution space to speed up computation

    Returns:
        Optimal v1
    """
    # The candidate search is a discrete optimization; it is done without
    # autograd tracking.
    with torch.no_grad():
        # Subsample columns by `skip` to shrink the candidate space
        # (trades exactness for speed when skip > 1).
        matrix_skipped = matrix[..., ::skip].abs()
        mask, masked_vs = compute_mask(matrix_skipped, ternary)

        # masked_vs is a vector, we need to separate it into potential
        # optimal solutions by row (dim 0)
        split_sizes = mask.sum(dim=1)

        if ternary:
            # handle a special case for ternary that rarely occurs
            masked_vs, split_sizes = _handle_ternary_min_gt_half_avg(
                matrix_skipped, masked_vs, split_sizes
            )

        # Pad each row's candidate list to equal length so the cost of every
        # candidate can be evaluated in one batched call.
        vs = torch.split(masked_vs, split_sizes.tolist())  # type: ignore
        vs = rnn_utils.pad_sequence(vs, batch_first=True)  # type: ignore

        # Pick, per row, the candidate minimizing the least-squares cost.
        costs = cost_function(matrix_skipped, vs, ternary)
        indices = torch.argmin(costs, dim=-1, keepdim=True)

        v1 = torch.gather(vs, 1, indices)

        return v1
def quantizer_ls_1(
    x: torch.Tensor, v1: Optional[torch.Tensor] = None
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Return (scaling factors, 1-bit optimal least-squares scaled binary quantization).

    If v1 is provided, it is directly used to compute the quantization.
    If v1 is not provided, it is computed as the per-sample mean absolute value.

    Reference:
        Rastegari, Mohammad, et al.
        "Xnor-net: Imagenet classification using binary convolutional neural networks."
        European conference on computer vision. Springer, Cham, 2016.

    Args:
        x: A 4D tensor
        v1: A vector of scaling factors, one per sample; computed when omitted

    Returns:
        A 2-tuple (v1, quantized x), where quantized x has the same shape as x.
    """
    if v1 is None:
        # detach() is sufficient here because abs()/mean() allocate new tensors;
        # the previous clone().detach() made a full copy of x, and did so even
        # when v1 was supplied and the copy was never used.
        v1 = x.detach().abs().mean(dim=-1).mean(dim=-1).mean(dim=-1)
    return v1, v1.view(-1, 1, 1, 1) * binarize(x)
def quantizer_ls_ternary(
    x: torch.Tensor, v1: Optional[torch.Tensor] = None, skip: int = 3
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Return (v1 scaling factors, optimal ternary least-squares scaled binary quantization).

    If v1 is provided, it is directly used to compute the quantization (v2 = v1).
    If v1 is not provided, it is computed as well.

    Args:
        x: A 4D tensor
        v1: A vector of scaling factors, v1
        skip: increment in potential solution space to speed up computation
    """
    # Flattened, detached view (one row per sample) used only to search for v1.
    x_data = x.view(x.shape[0], -1).clone().detach()
    if v1 is None:
        v1 = opt_v1(x_data, ternary=True, skip=skip)

    v1_reshaped = v1.view(x.shape[0], 1, 1, 1)
    b1 = binarize(x)
    # Both "bits" share the same scaling factor v1, so each element of the
    # result is v1 * (sign(x) + sign(x - v1*sign(x))), i.e. in {-2*v1, 0, 2*v1}.
    return v1.view(-1), v1_reshaped * (b1 + binarize(x - v1_reshaped * b1))
def binary_sign(x: torch.Tensor) -> BinaryTensor:
    """Return -1 if x < 0, 1 if x >= 0."""
    signs = x.sign()
    # x.sign() maps 0 to 0; promote those zeros to +1 so every element
    # ends up in {-1, 1}.
    return signs + (signs == 0).float()  # type: ignore
class WeightQuantizerLS1(nn.Module):
    """
    Weight quantizer using least squares, 1 bit.

    In training mode, the optimal scalars are computed and cached.
    In eval mode, the cached scalars are used to compute the quantization.
    """

    def __init__(self, size: int) -> None:
        """Construct a weight quantizer using least squares with 1 bit."""
        super(WeightQuantizerLS1, self).__init__()
        # Cached scaling factors, stored as a buffer so they are checkpointed
        # but never trained.
        self.register_buffer('v1', torch.zeros(size))

    def forward(self, w: torch.Tensor) -> torch.Tensor:  # type: ignore
        """Forward pass of quantizing weight using least squares 1 bit."""
        if not self.training:
            # Eval: reuse the scalars cached during training.
            _, w_q = quantization.quantizer_ls_1(w, self.v1)  # type: ignore
            return w_q
        # Training: recompute the optimal scalars and refresh the cache.
        v1, w_q = quantization.quantizer_ls_1(w)
        self.v1.copy_(v1)  # type: ignore
        return w_q
+ """ + + def __init__(self, size: int) -> None: + """Construct a weight quantizer using least squares with 2 bits.""" + super(WeightQuantizerLS2, self).__init__() + self.register_buffer('v1', torch.tensor([0.0] * size)) + self.register_buffer('v2', torch.tensor([0.0] * size)) + + def forward(self, w: torch.Tensor, skip: int = 3) -> torch.Tensor: # type: ignore + """Forward pass of quantizing weight using least squares 2 bits.""" + if self.training: + v1, v2, w_q = quantization.quantizer_ls_2(w, skip=skip) + self.v1.copy_(v1) # type: ignore + self.v2.copy_(v2) # type: ignore + else: + _, _, w_q = quantization.quantizer_ls_2(w, self.v1, self.v2, skip=skip) # type: ignore + return w_q + + +class WeightQuantizerLST(nn.Module): + """ + Weight quantizer using least squares, ternary. + + In training mode, the optimal scalars are computed and cached. + In eval mode, the cached scalars are used to compute the quantization. + """ + + def __init__(self, size: int) -> None: + """Construct a weight quantizer using least squares ternary.""" + super(WeightQuantizerLST, self).__init__() + self.register_buffer('v1', torch.tensor([0.0] * size)) + + def forward(self, w: torch.Tensor, skip: int = 3) -> torch.Tensor: # type: ignore + """Forward pass of quantizing weight using least squares ternary.""" + if self.training: + v1, w_q = quantization.quantizer_ls_ternary(w, skip=skip) + self.v1.copy_(v1) # type: ignore + else: + _, w_q = quantization.quantizer_ls_ternary(w, self.v1, skip=skip) # type: ignore + return w_q + + +class WeightQuantizerGF(nn.Module): + """ + Weight greedy foldable quantizer. + + In training mode, the optimal scalars are computed and cached. + In eval mode, the cached scalars are used to compute the quantization. 
+ """ + + def __init__(self, size: int, k: int) -> None: + """Construct a greedy-foldable quantizer with `k`-bits.""" + super(WeightQuantizerGF, self).__init__() + self.k = k + for i in range(1, k + 1): + self.register_buffer(f'v{i}', torch.tensor([0.0] * size)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: # type: ignore + """Forward pass of greedy foldable quantizer with `k`-bits.""" + if self.training: + vs, x_q = quantization.quantizer_gf(x, k=self.k) + for i in range(self.k): + getattr(self, f'v{i+1}').copy_(vs[i]) + else: + vs = [getattr(self, f'v{i+1}') for i in range(self.k)] + _, x_q = quantization.quantizer_gf(x, k=self.k, vs=vs) + return x_q diff --git a/quant/common/__init__.py b/quant/common/__init__.py new file mode 100644 index 0000000..3975959 --- /dev/null +++ b/quant/common/__init__.py @@ -0,0 +1,23 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# + +"""Common utilities and infrastructure for Quant.""" + +import logging + + +def init_logging(log_level: str) -> None: + """ + Initialize the logger. + + Args: + log_level (str): logging level, e.g. DEBUG, INFO, WARNING. + """ + level_map = { + 'DEBUG': logging.DEBUG, + 'INFO': logging.INFO, + 'WARNING': logging.WARNING, + } + logging.basicConfig(level=level_map[log_level]) diff --git a/quant/common/compute_platform.py b/quant/common/compute_platform.py new file mode 100644 index 0000000..5e57faf --- /dev/null +++ b/quant/common/compute_platform.py @@ -0,0 +1,114 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# + +""" +A compute platform is an abstraction of a platform on which to run an experiment. + +The most common compute platform is just running something locally, +using :class:`LocalComputePlatform`. +However, :class:`ComputePlatform` can be subclassed to run experiments on +other platforms, such as GPU nodes on some cloud service. 
+ +After instantiating a platform and an experiment, we just simply call +:meth:`ComputePlatform.run` to run the experiment on the platform. + +Driver scripts support the ``--restore-experiment `` option +to restore the latest checkpoint from a previous experiment. +""" + +from abc import ABC, abstractmethod +import os +from pathlib import Path +import subprocess +from typing import Callable, Optional + +from quant.common.experiment import Experiment +from quant.utils.utils import noop + + +def setup_restore_experiment(config: dict) -> Optional[Path]: + """Set the experiment path to restore experiment.""" + if 'restore_experiment' in config: + return Path(config['restore_experiment']) + return None + + +class ComputePlatform(ABC): + """Abstract class representing the compute platform to launch jobs from.""" + + def __init__(self, root_experiments_dir: str): + """ + Create a compute platform object. + + Args: + root_experiments_dir: root directory where experiments will be stored + """ + self.root_experiments_dir = Path(root_experiments_dir) + + @abstractmethod + def run(self, experiment: Experiment) -> None: + """ + Run an experiment on the compute platform. + + Args: + experiment: the experiment to run + """ + raise NotImplementedError + + +class LocalComputePlatform(ComputePlatform): + """Compute platform for running jobs on local machine.""" + + def __init__(self, root_experiments_dir: str): + """ + Create a compute local compute platform object. + + Args: + root_experiments_dir: root directory where experiments will be stored + """ + super(LocalComputePlatform, self).__init__(root_experiments_dir) + + def run( + self, + experiment: Experiment, + restore_experiment_setup: Callable[[dict], Optional[Path]] = setup_restore_experiment, + restore_experiment_cleanup: Callable[[dict], None] = noop, + ) -> None: + """ + Run an experiment function on local machine. 
+ + Args: + experiment: the experiment to run + restore_experiment_setup: A function that sets + up the experiment directory to restore, defaults to no-op + restore_experiment_cleanup: A function that cleans up + the experiment directory to restore, defaults to no-op + """ + # Run TensorBoard process in background + if experiment.config['log'].get('tensorboard'): + tensorboard_port = os.environ.get('TENSORBOARD_PORT', '6006') + tensorboard_proc = subprocess.Popen( + [ + 'tensorboard', + '--logdir', + experiment.config['log']['tensorboard_root'], + '--port', + str(tensorboard_port), + '--bind_all', + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=os.environ.copy(), + ) + + # Actually launch experiment + experiment.run( + self.root_experiments_dir, + restore_experiment_setup, + restore_experiment_cleanup, + ) + + if experiment.config['log'].get('tensorboard'): + tensorboard_proc.terminate() diff --git a/quant/common/experiment.py b/quant/common/experiment.py new file mode 100644 index 0000000..3c1fbde --- /dev/null +++ b/quant/common/experiment.py @@ -0,0 +1,125 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# + +""" +An experiment represents a single run of a task (model + data) in some configuration. + +An experiment is always run on some :class:`quant.common.compute_platform.ComputePlatform`. +It produces artifacts that can be used to reproduce the experiment, and logs +of the results, such as the evaluation metrics or TensorBoard logs. + +All experiments are stored in the `log.root_experiments_dir` specified in the config. +Each experiment has a name, which is by default the current datetime with the name +of the config. +However, a custom name can be specified by specifying ``--experiment_name `` at the CLI. +The artifacts related to an experiment is stored in a directory with the experiment name +in the `root_experiments_dir`. 
+""" + +from pathlib import Path +from typing import Callable, List, Optional, Tuple, Type +import yaml + +import pandas as pd + +from quant import Hook, MetricDict +from quant.utils.utils import noop +from quant.data.data_loaders import QuantDataLoader + + +def log_metrics_to_experiments_dir( + train_epoch_metrics: List[dict], + test_epoch_metrics: List[dict], + experiment_root_directory: Path, + experiment_name: str, + skip_training: bool = False, +) -> None: + """ + Log metrics to experiments directory. + + Args: + train_epoch_metrics: List of training metrics for every epoch + test_epoch_metrics: List of test metrics for every epoch + experiment_root_directory: root directory for storing logs, checkpoints, etc. + experiment_name: Name of experiment + skip_training: whether to log only eval metrics + """ + metrics_dir = experiment_root_directory / experiment_name / 'metrics' + metrics_dir.mkdir(exist_ok=True, parents=True) + + if not skip_training: + train_metrics_df = pd.DataFrame.from_records(train_epoch_metrics) + train_metrics_df.to_csv(metrics_dir / 'train.csv', index=False) + + test_metrics_df = pd.DataFrame.from_records(test_epoch_metrics) + test_metrics_df.to_csv(metrics_dir / 'test.csv', index=False) + + +class Experiment: + """A class representing an experiment.""" + + def __init__( + self, + task_fn: Callable, + config: dict, + data_loader_cls: Type[QuantDataLoader], + get_hooks: Callable[[dict, Path, MetricDict], Tuple[List[Hook], List[Hook]]], + ): + """ + Create an experiment. 
    def run(
        self,
        logging_root_dir: Path,
        restore_experiment_setup: Callable[[dict], Optional[Path]] = noop,
        restore_experiment_cleanup: Callable[[dict], None] = noop,
    ) -> None:
        """
        Run the experiment.

        Persists the merged config to ``<logging_root_dir>/<experiment_name>/config.yaml``,
        runs the task function, then writes per-epoch metrics as CSVs.

        Args:
            logging_root_dir: the root logging directory
            restore_experiment_setup: A function that sets
                up the experiment directory to restore, defaults to no-op
            restore_experiment_cleanup: A function that cleans up
                the experiment directory to restore, defaults to no-op
        """
        experiments_dir = logging_root_dir / self.config['experiment_name']
        experiments_dir.mkdir(exist_ok=True, parents=True)
        # Save the exact config used so the run can be reproduced later.
        with open(experiments_dir / 'config.yaml', 'w') as f:
            yaml.dump(self.config, f, default_flow_style=False)

        # Optionally resolve a previous experiment to resume from
        # (returns its path, or None when not restoring).
        restored_experiment_path = restore_experiment_setup(self.config)

        train_epoch_metrics, test_epoch_metrics = self.task_fn(
            self.config,
            logging_root_dir,
            self.data_loader_cls,
            self.get_hooks,
            restored_experiment_path,
        )

        # Write metrics to experiments directory
        log_metrics_to_experiments_dir(
            train_epoch_metrics,
            test_epoch_metrics,
            logging_root_dir,
            self.name,
            self.config['skip_training']
        )

        restore_experiment_cleanup(self.config)
def get_loss_fn(loss: str) -> Callable[..., torch.Tensor]:
    """
    Get loss function as a PyTorch functional loss based on the name of the loss function.

    Choices include 'cross_entropy', 'nll_loss', and 'kl_div'.

    Args:
        loss: a string indicating the loss function to return.

    Returns:
        The corresponding ``torch.nn.functional`` loss function.

    Raises:
        ValueError: if ``loss`` does not name a supported loss function.
    """
    loss_fn_mapping: Dict[str, Callable[..., torch.Tensor]] = {
        'cross_entropy': F.cross_entropy,
        'nll_loss': F.nll_loss,
        'kl_div': F.kl_div,
    }

    try:
        return loss_fn_mapping[loss]
    except KeyError:
        # Suppress the internal KeyError so callers see a clean ValueError
        # rather than a chained traceback.
        raise ValueError(f'Loss function {loss} is not supported.') from None
+ """ + use_cuda = ngpus > 0 and torch.cuda.is_available() + + if seed: + torch.manual_seed(seed) # type: ignore + + if use_cuda: # pragma: no cover + torch.backends.cudnn.deterministic = cudnn_deterministic # type: ignore + torch.backends.cudnn.benchmark = cudnn_benchmark # type: ignore + best_gpu_device_id = _get_best_gpus(1)[0] + # For data parallelism, parameters and buffers must be stored on the 1st device, devices[0] + # Here we ensure that we always return the first device id from the + # device ids available for DataParallel + device = torch.device(f'cuda:{best_gpu_device_id}') + else: + device = torch.device('cpu') + + return device + + +def _get_best_gpus(k: int) -> List[int]: + """Return the top k device ids associated with GPUs with the best compute capability.""" + # Select top ngpus based on CUDA device capability score + max_gpus = torch.cuda.device_count() + capabilities = [torch.cuda.get_device_capability(i) for i in range(max_gpus)] + ranked_device_ids = sorted(enumerate(capabilities), key=lambda t: t[1], reverse=True) + device_ids = [d[0] for d in ranked_device_ids][:k] + return device_ids + + +def get_model( + architecture: str, loss_fn: Callable[..., torch.Tensor], + arch_config: dict, device: torch.device, ngpus: int +) -> Union[nn.Module, nn.DataParallel]: + """ + Get model from config. 
+ + Args: + architecture: model architecture + loss_fn: loss function in ``torch.nn.functional`` + arch_config: architecture config to be passed to model constructor + device: the device this model should be stored on + ngpus: the number of GPUs to use + + Returns: + A nn.Module object if for single GPU, or nn.DataParallel object if using multiple GPUs + """ + try: + model = model_mapping[architecture](loss_fn=loss_fn, **arch_config) + except KeyError: + raise ValueError(f'Model architecture {architecture} is not found.') + + max_gpus = torch.cuda.device_count() + if ngpus > max_gpus: + raise ValueError( + f"Device only has {max_gpus} GPUs, but {ngpus} are specified." + ) + + if ngpus > 1: + best_gpus = _get_best_gpus(ngpus) + model = nn.DataParallel(model, device_ids=best_gpus) + + model = model.to(device) + + return model + + +def get_optimizer(parameters: Iterator[nn.Parameter], config: dict) -> optim.Optimizer: # type: ignore # noqa: E501 + """ + Get an optimizer. + + Choices include 'sgd', 'adam', and 'sgd'. + + Args: + parameters: Parameters to optimize + config: A dictionary containing configurations for the optimizer. + It must have at minimum an 'algorithm' key and + `required arguments `_ + for the optimizer. + + Returns: + A PyTorch optimizer. + """ + config = copy.deepcopy(config) + algorithm = config.pop('algorithm') + + name_to_optimizer = { + 'adadelta': optim.Adadelta, # type: ignore + 'adam': optim.Adam, + 'sgd': optim.SGD, + } + + return name_to_optimizer[algorithm](parameters, **config) + + +def get_lr_scheduler( + optimizer: optim.Optimizer, config: dict, epochs: int, steps_per_epoch: int # type: ignore +) -> optim.lr_scheduler._LRScheduler: + """ + Get a LR scheduler. + + Choices include 'step_lr', 'multi_step_lr', 'linear_lr', and 'lambda_lr'. + + Typically in PyTorch, the learning rate scheduler calls `step()` after every epoch. + In this project, we call `step()` after every batch in every epoch. 
+ Hence, parameters such as `step_lr` in `StepLR` and `milestones` in `MultiStepLR` + are scaled by the number of steps per epoch. + If you use `LambdaLR`, keep in mind that the lambda function takes the + global step (batch) index, not the epoch index. + + We have one custom learning rate scheduler, + :class:`~quant.common.linear_lr_scheduler.LinearLR`, that can be used by selecting `linear_lr`. + + All other schedulers are shipped with PyTorch. + + Args: + optimizer: Optimizer to adjust learning rate for + config: A dictionary containing configurations for the LR scheduler. + It must have at minimum a 'scheduler' key and + `args `_ + for the scheduler. + epochs: total number of epochs + steps_per_epoch: Steps (batches) per epoch + + Returns: + A PyTorch learning rate scheduler. + """ + config = copy.deepcopy(config) + scheduler = config.pop('scheduler') + + name_to_scheduler = { + 'linear_lr': LinearLR, + 'lambda_lr': lr_scheduler.LambdaLR, + 'step_lr': lr_scheduler.StepLR, + 'multi_step_lr': lr_scheduler.MultiStepLR, + } + + if scheduler == 'linear_lr': + config['steps_per_epoch'] = steps_per_epoch + config['total_epochs'] = epochs + config['min_lr'] = float(config['min_lr']) # YAML parses 2e-7 to a string instead of float + elif scheduler == 'lambda_lr': + config['lr_lambda'] = eval(config['lr_lambda']) + elif scheduler == 'step_lr': + config['step_size'] *= steps_per_epoch + elif scheduler == 'multi_step_lr': # pragma: no cover (coverage does not report it even though it's covered) # noqa: E501 + new_milestones = [epochs * steps_per_epoch for epochs in config['milestones']] + config['milestones'] = new_milestones + + return name_to_scheduler[scheduler](optimizer, **config) diff --git a/quant/common/metrics.py b/quant/common/metrics.py new file mode 100644 index 0000000..d375313 --- /dev/null +++ b/quant/common/metrics.py @@ -0,0 +1,218 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. 
+# + +""" +This module contains classes for logging evaluation metrics. + +Metric is a stateful object that allows computing collective statistics +on an arbitrary subset of the dataset by implementing three methods: +:meth:`Metric.update`, :meth:`Metric.compute`, and :meth:`Metric.reset`. + +For example, :class:`TopKAccuracy` is an evaluation metric for computing the top-`k` accuracy. +All metrics should subclass the :class:`Metric` abstract class. +Each metric has three key methods for updating and getting its value: +:meth:`Metric.update`, :meth:`Metric.compute`, and :meth:`Metric.reset`. + +In each epoch, we generally need to iterate the dataset in batches. +Based on the predictions of examples in each batch, we need to update the value of +metrics accordingly. +The :meth:`Metric.update` method is used to update the metric +based on the results of new observations in each batch. +Depending on whether we set `accumulate` to `True` when creating the metric, +the metric is either accumulated with the result of the current batch or overwritten with it. + +Once we finish collecting the predictions, it is time to compute the value of the metric +of the whole dataset using :meth:`Metric.compute`. +Before moving on to the next epoch, we may want to :meth:`Metric.reset` the metric so +that we start evaluating on the predictions of the new epoch afresh. + +A typical structure looks like this:: + + for epoch in epochs: + metric.reset() + for batch_idx, (data, target) in enumerate(train_loader): + ... + metric.update(output, target) + + print('Metric value is:', metric.compute()) + +""" + +from abc import ABC, abstractmethod +from typing import Any, Callable, Optional + +import torch +from torch import Tensor + + +class Metric(ABC): + """Abstract class for an evaluation metric.""" + + DEFAULT_PRECISION = 4 + + def __init__(self, accumulate: bool) -> None: + """ + Create a metric object. 
+ + Args: + accumulate: whether to accumulate metrics + """ + self.n_examples = 0 + self.total = 0.0 + self.accumulate = accumulate + + @abstractmethod + def update(self, output: Tensor, target: Tensor, **kwargs: Any) -> None: + """ + Update the evaluation metric based on the results of the current batch. + + Args: + output: the output of the model + target: the target we want the model to predict + """ + raise NotImplementedError + + def reset(self) -> None: + """Reset metric after every epoch.""" + self.n_examples = 0 + self.total = 0.0 + + @abstractmethod + def compute(self) -> float: + """ + Compute the overall evaluation metric once everything is done. + + Returns: + The final evaluation metric as a numeric value. + """ + raise NotImplementedError + + +class LossMetric(Metric): + """A metric for a loss criterion.""" + + def __init__(self, criterion: Callable[..., Tensor], accumulate: bool) -> None: + """ + Create a metric object for computing loss. + + Args: + criterion: loss function + accumulate: whether to accumulate metrics + """ + super(LossMetric, self).__init__(accumulate) + self.criterion = criterion + + def update(self, output: Tensor, target: Tensor, + teacher_output: Optional[Tensor] = None, **kwargs: Any) -> None: + """ + Update the loss metric based on the results of the current batch. 
+ + Args: + output: the output of the model + target: the target we want the model to predict + teacher_output: teacher output for knowledge distillation + """ + kd_criterion = 0 + if teacher_output is not None: + kd_criterion = self.criterion(output, teacher_output, target).item() # type: ignore + + if self.accumulate: + self.n_examples += output.shape[0] + if teacher_output is None: + self.total += self.criterion(output, target, reduction='sum').item() + else: + self.total += kd_criterion * output.shape[0] # kd criterion uses batchmean + else: + if teacher_output is None: + self.total = self.criterion(output, target, reduction='mean').item() + else: + self.total = kd_criterion + + def compute(self) -> float: + """Compute the loss metric once everything is done.""" + return self.total / self.n_examples if self.accumulate else self.total + + def __str__(self) -> str: + """Get a string representation of the computed metric showing more detailed statistics.""" + return '{0:.{1}f}'.format(self.compute(), 8) + + +class Top1Accuracy(Metric): + """Top-1 accuracy metric.""" + + def __init__(self, accumulate: bool) -> None: + """Create a metric object for computing top-1 accuracy.""" + super(Top1Accuracy, self).__init__(accumulate) + + def update(self, output: Tensor, target: Tensor, **kwargs: Any) -> None: + """ + Update the top-1 accuracy based on the results of the current batch. 
+ + Args: + output: the output of the model + target: the target we want the model to predict + """ + pred_top = output.argmax(dim=1, keepdim=True) + target = target.view_as(pred_top) + num_correct = pred_top.eq(target).sum().item() + if self.accumulate: + self.n_examples += output.shape[0] + self.total += num_correct + else: + self.n_examples = output.shape[0] + self.total = num_correct + + def compute(self) -> float: + """Compute the overall top-1 accuracy once everything is done.""" + return self.total / self.n_examples + + def __str__(self) -> str: + """Get a string representation of the computed metric showing more detailed statistics.""" + return '{0}/{1} ({2:.{3}f}%)'.format( + self.total, self.n_examples, 100 * self.compute(), self.DEFAULT_PRECISION + ) + + +class TopKAccuracy(Metric): + """Top-K accuracy metric.""" + + def __init__(self, k: int, accumulate: bool): + """ + Create a metric object for computing top-`k` accuracy. + + Args: + k: The "k" in top-`k` accuracy + accumulate: whether to accumulate metrics + """ + super(TopKAccuracy, self).__init__(accumulate) + self.k = k + + def update(self, output: Tensor, target: Tensor, **kwargs: Any) -> None: + """ + Update the top-`k` accuracy based on the results of the current batch. 
+ + Args: + output: the output of the model + target: the target we want the model to predict + """ + _, pred_topk = torch.topk(output, dim=1, k=self.k) + if self.accumulate: + self.n_examples += output.shape[0] + self.total += ( + (target.view(-1, 1).expand_as(pred_topk) == pred_topk).sum().item() + ) + else: + self.n_examples = output.shape[0] + self.total = (target.view(-1, 1).expand_as(pred_topk) == pred_topk).sum().item() + + def compute(self) -> float: + """Compute the overall top-`k` accuracy once everything is done.""" + return self.total / self.n_examples + + def __str__(self) -> str: + """Get a string representation of the computed metric showing more detailed statistics.""" + return '{0}/{1} ({2:.{3}f}%)'.format( + self.total, self.n_examples, 100 * self.compute(), self.DEFAULT_PRECISION + ) diff --git a/quant/common/parser.py b/quant/common/parser.py new file mode 100644 index 0000000..428cf54 --- /dev/null +++ b/quant/common/parser.py @@ -0,0 +1,261 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# + +""" +Configurations for Quant. + +All configurations are specified in YAML files. +Certain parameters such as the number of GPUs can be overridden or specified from the CLI. + +The config is divided into sections:: + + seed: (int or null) + environment: ... + data: ... + model: ... + optimization: ... + log: ... + +See `examples/mnist/mnist.yaml` for an example. + +The model architecture, loss criterion, optimizer, and learning rate scheduler are all specified +in the YAML config. + +Environment +^^^^^^^^^^^ + +This section specifies the computing environment and resources. + +`platform` should always be set to `local`. +One can subclass :class:`~quant.common.compute_platform.ComputePlatform` +and create alternate platforms to train models on (such as some cloud GPU server). +If you do this `platform` can be set to something else to distinguish it from `local`. 
#
# For licensing see accompanying LICENSE file.
# Copyright (C) 2020 Apple Inc. All Rights Reserved.
#

"""
Configurations for Quant.

All configurations are specified in YAML files.
Certain parameters such as the number of GPUs can be overridden or specified from the CLI.

The config is divided into sections::

    seed: (int or null)
    environment: ...
    data: ...
    model: ...
    optimization: ...
    log: ...

See `examples/mnist/mnist.yaml` for an example.

The model architecture, loss criterion, optimizer, and learning rate scheduler are all specified
in the YAML config.

Environment
^^^^^^^^^^^

This section specifies the computing environment and resources.

`platform` should always be set to `local`.
One can subclass :class:`~quant.common.compute_platform.ComputePlatform`
and create alternate platforms to train models on (such as some cloud GPU server).
If you do this `platform` can be set to something else to distinguish it from `local`.

`ngpus` specifies the number of GPUs to use.

The `cuda` subsection can be configured to set CUDA configurations, for example::

    cuda:
      cudnn_deterministic: false
      cudnn_benchmark: true

Data
^^^^

The data section sets the dataset location, batch sizes, and number of workers for
dataset loading.

Here is an example::

    data:
      dataset_path: data/imagenet/
      train_batch_size: 256
      test_batch_size: 256
      workers: 16

Model
^^^^^

This section specifies the model architecture and loss::

    model:
      architecture: lenet5
      loss: nll_loss
      arch_config: ...

Supported architectures include: `lenet5` and `resnet`.
Supported loss functions include: `cross_entropy`, `nll_loss`, `kl_div`.
Architecture config stores keyword arguments passed to the model constructor.
See model constructor documentation (:class:`~quant.models.lenet.QLeNet5` or
:class:`~quant.models.resnet.QResNet`) for more info.

For training with teacher, one can add another subsection under `model`, such as::

    kd_config:
      teacher_config_path: examples/imagenet/imagenet_fp.yaml
      teacher_checkpoint_path: experiments/imagenet-teacher/checkpoints/checkpoint_100.pt
      freeze_teacher: true
      train_mode: true
      criterion_config:
        temperature: 1

Optimization
^^^^^^^^^^^^

This section specifies configurations for the optimizer and learning rate scheduler,
for example::

    optimization:
      epochs: 14
      optimizer:
        algorithm: adadelta
        lr: 1.0
      lr_scheduler:
        scheduler: step_lr
        step_size: 1
        gamma: 0.7

Optimization algorithms (`algorithm`) supported include: `sgd`, `adam`, `adadelta`.
All other key-value pairs under `optimizer` are passed directly as keyword arguments to
the corresponding PyTorch optimizer class's constructor:
https://pytorch.org/docs/stable/optim.html#algorithms.

Learning rate schedulers (`scheduler`) supported include:
`linear_lr`, `lambda_lr`, `step_lr`, and `multi_step_lr`.
All other key-value pairs under `lr_scheduler` are passed directly as keyword arguments to
the corresponding PyTorch LR scheduler class's constructor:
https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate.
See more details about the scheduler configurations at
:meth:`~quant.common.initialization.get_lr_scheduler`.

Log
^^^

This section specifies the configurations for logging, checkpointing, and visualization.

A sample config looks like this::

    log:
      level: INFO
      interval: 100
      tensorboard: true
      tensorboard_root: runs/
      root_experiments_dir: experiments/
      save_model_freq: 20

`interval` is number of batches per print of the current metrics to STDOUT and TensorBoard.

If `tensorboard` is true, TensorBoard will be used to visualize metrics.
`tensorboard_root` is the location of all TensorBoard logs. The location for
visualization logs of one experiment will be under a subdirectory with the experiment name.

`root_experiments_dir` is the root location for storing all experiment logs.
The logs for one experiment will be stored under a subdirectory with the experiment name,
which can be passed in via the CLI (or omit to use default).

The experiment directory will contain the resolved config, overall metrics, checkpoints,
and copy of TensorBoard logs.

`save_model_freq` is the number of epochs between saving checkpoints.
The last epoch is always saved.
"""

from argparse import ArgumentParser, Namespace
from datetime import datetime
from pathlib import Path
from typing import Callable

import torch


def _validate_args(args: Namespace) -> None:
    """
    Validate arguments.

    Args:
        args: parsed argparse CLI args

    Raises:
        ValueError: if the combination of CLI flags is inconsistent.
    """
    if not args.restore_experiment and not args.config:
        raise ValueError('--config must be specified if not restoring from experiment.')

    if args.restore_experiment and args.init_from_checkpoint:
        raise ValueError('Only one of --restore-experiment / --init-from-checkpoint can be set.')


def parse_common_fields(args: Namespace, config: dict) -> None:
    """
    Populate common fields in the config with parsed args.

    The config dictionary is mutated in place.

    Args:
        args: parsed argparse CLI args
        config: config dictionary storing final resolved args
    """
    if args.experiment_name is not None:
        config['experiment_name'] = args.experiment_name
    else:
        # Default experiment name: timestamp + config file name, so runs do not collide.
        current_time = datetime.now().strftime('%b%d_%H-%M-%S')
        config_name_without_ext = Path(config['config']).stem
        config['experiment_name'] = f'{current_time}_{config_name_without_ext}'

    if 'environment' not in config or 'platform' not in config['environment']:
        config['environment'] = {'platform': 'local'}

    if args.ngpus is not None:
        config['environment']['ngpus'] = args.ngpus
    if 'ngpus' not in config['environment']:
        config['environment']['ngpus'] = 1 if torch.cuda.is_available() else 0

    config['skip_training'] = args.skip_training

    if args.init_from_checkpoint:
        config['init_from_checkpoint'] = args.init_from_checkpoint


def parse_config(args: Namespace, validator: Callable[[Namespace], None] = _validate_args) -> dict:
    """
    Parse config file and override with CLI args.

    Args:
        args: parsed argparse CLI args
        validator: validator for config

    Returns:
        A resolved config, applying CLI args on top of the config file
    """
    # Imported lazily so that argument parsing / --help works without PyYAML installed.
    import yaml

    validator(args)

    config = {}
    if args.restore_experiment:
        with open(Path(args.restore_experiment) / 'config.yaml') as f:
            config = yaml.safe_load(f)

    # An explicit --config takes precedence over the restored experiment's config.
    if args.config:
        with open(args.config) as f:
            config = yaml.safe_load(f)
        config['config'] = args.config

    parse_common_fields(args, config)

    if args.restore_experiment:
        config['restore_experiment'] = args.restore_experiment

    return config


def get_base_argument_parser(description: str) -> ArgumentParser:
    """
    Get a base argument parser for driver scripts.

    Args:
        description: A string describing the driver script.

    Returns:
        Parser object to extend.
    """
    # Pass as a keyword argument: the first positional parameter of
    # ArgumentParser is `prog`, not `description`.
    parser = ArgumentParser(description=description)
    parser.add_argument('--config', type=str, help='Path to a yaml config file.')
    parser.add_argument(
        '--experiment-name', type=str, default=None, help='Name of the experiment.'
    )
    parser.add_argument(
        '--ngpus', type=int, default=None, help='Number of GPUs. Use 0 for CPU.'
    )
    parser.add_argument(
        '--skip-training',
        default=False,
        action='store_true',
        help='Skip training and only run evaluation. Checkpoint must be passed in as well.',
    )
    parser.add_argument(
        '--restore-experiment',
        type=str,
        help='Path to experiments directory to restore checkpoint from.',
    )
    parser.add_argument(
        '--init-from-checkpoint',
        type=str,
        help='Path to model file to initialize model parameters.',
    )
    return parser
+# + +"""Utilities for running tasks.""" + +from functools import partial +from pathlib import Path +from typing import Callable, Dict, List, Optional, Tuple, Type, Union +import yaml + +import torch +import torch.nn as nn + +from quant import Hook, MetricDict +from quant.common import init_logging +from quant.utils.checkpoints import get_path_to_checkpoint, log_checkpoints, \ + restore_from_checkpoint +from quant.common.initialization import ( + get_device, + get_model, + get_optimizer, + get_lr_scheduler, + get_loss_fn, +) +from quant.common.metrics import LossMetric, Top1Accuracy, TopKAccuracy +from quant.common.training import train, evaluate +from quant.data.data_loaders import QuantDataLoader +from quant.utils.kd_criterion import kd_criterion + + +def get_teacher_and_kd_loss( + teacher_config_path: str, + teacher_checkpoint_path: str, + train_mode: bool, + criterion_config: dict, + device: torch.device, + ngpus: int, + freeze_teacher: bool = True, + strict_keys: bool = True, +) -> Tuple[Union[nn.Module, nn.DataParallel], Callable[..., torch.Tensor]]: + """ + Get teacher and KD loss for knowledge distillation. 
+ + Args: + teacher_config_path: path to config used to train teacher + teacher_checkpoint_path: path to checkpoint to use to initialize teacher + train_mode: if true, use teacher in train mode, or use eval mode otherwise + criterion_config: config for KD criterion, such as alpha and temperature + device: PyTorch device used to store teacher, should the be the same as model + ngpus: number of GPUs to run teacher, should be the same as that of the student model + freeze_teacher: whether to freeze teacher + strict_keys: whether to enforce keys must exactly match for restoring checkpoint + + Returns: + An initialized teacher and KD loss function with teacher-related args resolved + """ + with open(teacher_config_path) as f: + teacher_config = yaml.safe_load(f) + teacher_model_config = teacher_config['model'] + + loss_fn = get_loss_fn(teacher_model_config['loss']) + teacher = get_model( + architecture=teacher_model_config['architecture'], + loss_fn=loss_fn, + arch_config=teacher_model_config['arch_config'], + device=device, + ngpus=ngpus, + ) + + restore_from_checkpoint(teacher, None, None, teacher_checkpoint_path, device, strict_keys) + + if freeze_teacher: + for p in teacher.parameters(): + p.requires_grad_(False) + + teacher.train() if train_mode else teacher.eval() + + kd_loss = partial(kd_criterion, freeze_teacher=freeze_teacher, **criterion_config) + + return teacher, kd_loss + + +def classification_task( + config: dict, + experiment_root_directory: Path, + data_loader_cls: Type[QuantDataLoader], + get_hooks: Callable[[dict, Path, MetricDict, MetricDict], Tuple[List[Hook], List[Hook]]], + restore_experiment: Optional[Path] = None, +) -> Tuple[List[Dict[str, float]], List[Dict[str, float]]]: + """ + Driver program for running classification task. + + Args: + config: merged config with CLI args + experiment_root_directory: root directory for storing logs, checkpoints, etc. 
+ data_loader_cls: The QuantDataLoader class + get_hooks: a function that returns lists of training and testing hooks + restore_experiment: path to experiment to restore, None for do not restore + + Returns: + (List of training set metrics for each epoch, list of test set metrics for each epoch). + """ + env_config = config['environment'] + data_config = config['data'] + model_config = config['model'] + optimization_config = config['optimization'] + log_config = config['log'] + + init_logging(log_config['level']) + + device = get_device(env_config['ngpus'], config.get('seed'), **env_config.get('cuda', {})) + + data_loader = data_loader_cls(**data_config) + train_loader = data_loader.get_train_loader() if not config.get('skip_training') else None + test_loader = data_loader.get_test_loader() + + epochs = optimization_config['epochs'] + + teacher = None + use_kd = 'kd_config' in model_config + if use_kd: + teacher, kd_loss = get_teacher_and_kd_loss( + device=device, ngpus=env_config['ngpus'], + strict_keys=model_config.get('strict_keys', True), + **model_config['kd_config'] + ) + + loss_fn = get_loss_fn(model_config['loss']) if not use_kd else kd_loss + model = get_model( + architecture=model_config['architecture'], + loss_fn=loss_fn, + arch_config=model_config['arch_config'], + device=device, + ngpus=env_config['ngpus'], + ) + + optimizer, scheduler = None, None + if not config.get('skip_training'): + optimizer = get_optimizer(model.parameters(), optimization_config['optimizer']) + scheduler = get_lr_scheduler(optimizer, optimization_config['lr_scheduler'], epochs, len(train_loader)) # type: ignore # noqa: E501 + + if restore_experiment is not None: + checkpoint_path = get_path_to_checkpoint(restore_experiment) + model, restored_optimizer, restored_scheduler, start_epoch = restore_from_checkpoint( + model, + optimizer, + scheduler, + checkpoint_path, + device, + model_config.get('strict_keys', True), + ) + optimizer, scheduler = restored_optimizer, 
restored_scheduler + start_epoch += 1 + elif config.get('init_from_checkpoint'): + model, _, _, _ = restore_from_checkpoint( + model, + None, + None, + config['init_from_checkpoint'], + device, + model_config.get('strict_keys', True), + ) + start_epoch = 1 + else: + start_epoch = 1 + + train_metrics = { + 'Loss': LossMetric(loss_fn, accumulate=True), + 'Top-1 Accuracy': Top1Accuracy(accumulate=True), + 'Top-5 Accuracy': TopKAccuracy(5, accumulate=True), + } + + test_metrics = { + 'Loss': LossMetric(get_loss_fn(model_config['loss']), accumulate=True), + 'Top-1 Accuracy': Top1Accuracy(accumulate=True), + 'Top-5 Accuracy': TopKAccuracy(5, accumulate=True), + } + + train_hooks, test_hooks = get_hooks(config, experiment_root_directory, + train_metrics, test_metrics) + train_epoch_metrics, test_epoch_metrics = [], [] + + if config.get('skip_training'): + computed_test_metrics = evaluate( + model=model, + test_loader=test_loader, + metrics=test_metrics, + device=device, + epoch=1, + hooks=test_hooks, + ) + test_epoch_metrics.append(computed_test_metrics) + else: + for epoch in range(start_epoch, start_epoch + epochs): + computed_train_metrics = train( + model=model, + train_loader=train_loader, # type: ignore + metrics=train_metrics, + optimizer=optimizer, + scheduler=scheduler, # type: ignore + device=device, + epoch=epoch, + log_interval=log_config['interval'], + hooks=train_hooks, + teacher=teacher, + ) + computed_test_metrics = evaluate( + model=model, + test_loader=test_loader, + metrics=test_metrics, + device=device, + epoch=epoch, + hooks=test_hooks, + ) + + train_epoch_metrics.append(computed_train_metrics) + test_epoch_metrics.append(computed_test_metrics) + + if epoch % log_config['save_model_freq'] == 0 or epoch == epochs: + log_checkpoints( + experiment_root_directory / config['experiment_name'] / 'checkpoints', + model, + optimizer, # type: ignore + scheduler, # type: ignore + epoch, + ) + + data_loader.cleanup() + + return train_epoch_metrics, 
#
# For licensing see accompanying LICENSE file.
# Copyright (C) 2020 Apple Inc. All Rights Reserved.
#

"""
Quant provides generic training and test loops that can be used for all datasets.

Training and test loops both support hooks, which are functions that are called
inside each batch of each epoch. This allows different driver scripts to use
the same training and test loops, sharing the same structure, while making it possible
to introduce custom behavior.

Each hook can take a variable number of keyword arguments.
They will always be given `epoch` and `global_step`.
`epoch` is an integer, starting from 1, that represents the current epoch.
`global_step` is a unique, incrementing counter for every batch of every epoch.
It starts at `1` and goes to `num_epochs * ceil(dataset_size / batch_size)`, inclusive.
Hooks can use the other keyword arguments to implement custom behavior.

One example of a hook implemented in the library is the visualization hook
that supports logging metrics to be viewed via TensorBoard:
:meth:`quant.utils.visualization.Visualizer.hook`
"""

import logging
from typing import Dict, Optional, Sequence, Union

import torch
import torch.nn as nn
from torch.optim import Optimizer  # type: ignore
from torch.optim.lr_scheduler import _LRScheduler
from torch.utils.data.dataloader import DataLoader

from quant import Hook
from quant.common.metrics import Metric


logger = logging.getLogger(__name__)


def _get_lr(optimizer: Optimizer) -> float:
    """
    Get learning rate of the first parameter group.

    Args:
        optimizer (optim.Optimizer): PyTorch optimizer

    Raises:
        ValueError: if the optimizer has no parameter groups.
    """
    for param_group in optimizer.param_groups:
        return param_group['lr']

    raise ValueError('Cannot get optimizer LR: optimizer does not have any parameter groups.')


def project(optimizer: Optimizer) -> None:
    """Project model parameters to a range so that they can be updated."""
    # No-op
    # In theory, we should project the quantized weights to the [-1, 1] range
    # so that we have non-zero gradients and they can be updated.
    # However, in practice, we notice that this does not make a difference.
    # Hence, this is a no-op.
    _ = optimizer
    return None


def train(
    model: Union[nn.Module, nn.DataParallel],
    train_loader: DataLoader,
    metrics: Dict[str, Metric],
    optimizer: Optimizer,
    scheduler: _LRScheduler,
    device: torch.device,
    epoch: int,
    log_interval: int,
    hooks: Optional[Sequence[Hook]] = None,
    teacher: Optional[Union[nn.Module, nn.DataParallel]] = None,
) -> Dict[str, float]:
    """
    Train a model on some data using some criterion and with some optimizer.

    Args:
        model: Model to train
        train_loader: Data loader for loading training data
        metrics: A dict mapping evaluation metric names to metrics classes
        optimizer: PyTorch optimizer
        scheduler: PyTorch scheduler
        device: PyTorch device object
        epoch: Current epoch, where the first epoch should start at 1
        log_interval: Number of batches before printing loss
        hooks: A sequence of functions that can implement custom behavior
        teacher: teacher network for knowledge distillation, if any

    Returns:
        A dictionary mapping evaluation metric names to computed values for the training set.
    """
    if hooks is None:
        hooks = []

    model.train()
    for metric in metrics.values():
        metric.reset()

    loss_fn = model.module.loss_fn if isinstance(model, nn.DataParallel) else model.loss_fn

    seen_examples = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        if teacher is None:
            teacher_output = None
            loss = loss_fn(output, target)  # type: ignore
        else:
            teacher_output = teacher(data)
            loss = loss_fn(output, teacher_output, target)  # type: ignore
        loss.backward()
        optimizer.step()
        project(optimizer)
        scheduler.step()  # type: ignore

        with torch.no_grad():
            for metric in metrics.values():
                metric.update(output, target, teacher_output=teacher_output)

            for hook in hooks:
                hook(
                    epoch=epoch,
                    # global_step counts batches, so the per-epoch stride is the number
                    # of batches (len(train_loader)), not the number of examples.
                    global_step=1 + (epoch - 1) * len(train_loader) + batch_idx,
                    values_dict={'lr': _get_lr(optimizer)},
                    log_interval=log_interval,
                )

        seen_examples += len(data)
        if batch_idx % log_interval == 0:
            logger.info(
                'Train Epoch: {} [{}/{} ({:.0f}%)]\tBatch Loss: {:.6f}'.format(
                    epoch,
                    seen_examples,
                    len(train_loader.dataset),
                    100 * batch_idx / len(train_loader),
                    loss.item(),
                )
            )

    # Computing evaluation metrics for training set
    computed_metrics = {name: metric.compute() for name, metric in metrics.items()}

    logger.info('Training set evaluation metrics:')
    for name, metric in metrics.items():
        logger.info(f'{name}: {metric}')

    return computed_metrics


def evaluate(
    model: Union[nn.Module, nn.DataParallel],
    test_loader: DataLoader,
    metrics: Dict[str, Metric],
    device: torch.device,
    epoch: int,
    hooks: Optional[Sequence[Hook]] = None,
) -> Dict[str, float]:
    """
    Evaluate model on some held-out set.

    Args:
        model: Model to test on
        test_loader: Data loader for loading test data
        metrics: A dict mapping evaluation metric names to metrics classes
        device: PyTorch device object
        epoch: Current epoch, where the first epoch should start at 1
        hooks: A sequence of functions that can implement custom behavior

    Returns:
        A dictionary mapping evaluation metric names to computed values.
    """
    if hooks is None:
        hooks = []

    model.eval()
    for metric in metrics.values():
        metric.reset()

    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(test_loader):
            data, target = data.to(device), target.to(device)
            output = model(data)

            for metric in metrics.values():
                metric.update(output, target)

            for hook in hooks:
                hook(
                    epoch=epoch,
                    # Stride by batch count, matching the global_step contract in
                    # the module docstring.
                    global_step=1 + (epoch - 1) * len(test_loader) + batch_idx
                )

    computed_metrics = {name: metric.compute() for name, metric in metrics.items()}

    logger.info('Test set evaluation metrics:')
    for name, metric in metrics.items():
        logger.info(f'{name}: {metric}')

    return computed_metrics
class QuantDataLoader(ABC):
    """Abstract class from which to instantiate training and test set PyTorch data loaders."""

    def __init__(
        self,
        train_batch_size: int,
        test_batch_size: int,
        dataset_path: str,
        workers: int,
        download: bool = True,
        test_sampler: t.Optional[Sampler] = None,
    ):
        """
        Construct QuantDataLoader object, used for obtaining training and test set loaders.

        Args:
            train_batch_size: training set batch size
            test_batch_size: test set batch size
            dataset_path: root location of the dataset
            workers: number of workers to use for the data loader
            download: whether to download dataset.
                If false `dataset_path` should contain pre-downloaded dataset.
            test_sampler: PyTorch data sampler for the test set
        """
        self.train_batch_size = train_batch_size
        self.test_batch_size = test_batch_size
        self.dataset_path = dataset_path
        self.workers = workers
        self.download = download
        self.test_sampler = test_sampler

    @abstractmethod
    def get_train_loader(self) -> DataLoader:
        """Get a PyTorch data loader for the training set."""
        raise NotImplementedError

    @abstractmethod
    def get_test_loader(self) -> DataLoader:
        """Get a PyTorch data loader for the test set."""
        raise NotImplementedError

    def cleanup(self) -> None:
        """Clean up any temporary data. Default implementation is a no-op."""
        pass


class MNISTDataLoader(QuantDataLoader):
    """
    Subclass of :class:`~quant.data.data_loaders.QuantDataLoader`, for MNIST.

    If the `dataset_path` does not already have the dataset, it is downloaded from the web.
    """

    def __init__(
        self,
        train_batch_size: int,
        test_batch_size: int,
        dataset_path: str,
        workers: int,
        download: bool = True,
        test_sampler: t.Optional[Sampler] = None,
    ):
        """Construct a class for getting MNIST data loaders."""
        super(MNISTDataLoader, self).__init__(
            train_batch_size,
            test_batch_size,
            dataset_path,
            workers,
            download,
            test_sampler,
        )
        # Standard MNIST normalization constants (mean, std) for a single channel.
        self.transform_fn = transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
        )

    def get_train_loader(self) -> DataLoader:
        """Get a PyTorch data loader for the training set."""
        train_loader = torch.utils.data.DataLoader(
            datasets.MNIST(
                self.dataset_path,
                train=True,
                download=self.download,
                transform=self.transform_fn,
            ),
            batch_size=self.train_batch_size,
            shuffle=True,
            # Fix: previously the `workers` constructor argument was silently
            # ignored for MNIST, inconsistent with all other loaders.
            num_workers=self.workers,
        )

        return train_loader

    def get_test_loader(self) -> DataLoader:
        """Get a PyTorch data loader for the test set."""
        test_loader = torch.utils.data.DataLoader(
            datasets.MNIST(
                self.dataset_path,
                train=False,
                download=self.download,
                transform=self.transform_fn,
            ),
            batch_size=self.test_batch_size,
            shuffle=False,
            sampler=self.test_sampler,
            # Fix: honor the `workers` constructor argument (see train loader).
            num_workers=self.workers,
        )

        return test_loader


class CIFAR10DataLoader(QuantDataLoader):
    """
    Subclass of :class:`~quant.data.data_loaders.QuantDataLoader`, for CIFAR-10.

    If the `dataset_path` does not already have the dataset, it is downloaded from the web.
    """

    def __init__(
        self,
        train_batch_size: int,
        test_batch_size: int,
        dataset_path: str,
        workers: int,
        download: bool = True,
        test_sampler: t.Optional[Sampler] = None,
    ):
        """Construct a class for getting CIFAR-10 data loaders."""
        super(CIFAR10DataLoader, self).__init__(
            train_batch_size,
            test_batch_size,
            dataset_path,
            workers,
            download,
            test_sampler,
        )
        # Per-channel (R, G, B) normalization statistics for CIFAR-10.
        self.mean_val = (0.4914, 0.4822, 0.4465)
        self.std_val = (0.2023, 0.1994, 0.2010)

    def get_train_loader(self) -> DataLoader:
        """Get a PyTorch data loader for the training set with standard augmentation."""
        transform_train = transforms.Compose(
            [
                transforms.RandomCrop(32, padding=4),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize(self.mean_val, self.std_val),
            ]
        )

        dataset_train = datasets.CIFAR10(
            root=self.dataset_path,
            train=True,
            download=self.download,
            transform=transform_train,
        )

        train_loader = torch.utils.data.DataLoader(
            dataset_train,
            batch_size=self.train_batch_size,
            shuffle=True,
            num_workers=self.workers,
            pin_memory=True,
        )

        return train_loader

    def get_test_loader(self) -> DataLoader:
        """Get a PyTorch data loader for the test set (no augmentation)."""
        transform_test = transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize(self.mean_val, self.std_val)]
        )

        dataset_test = datasets.CIFAR10(
            root=self.dataset_path,
            train=False,
            download=self.download,
            transform=transform_test,
        )

        test_loader = torch.utils.data.DataLoader(
            dataset_test,
            batch_size=self.test_batch_size,
            shuffle=False,
            sampler=self.test_sampler,
            num_workers=self.workers,
            pin_memory=True,
        )

        return test_loader


class CIFAR100DataLoader(QuantDataLoader):
    """
    Subclass of :class:`~quant.data.data_loaders.QuantDataLoader`, for CIFAR-100.

    If the `dataset_path` does not already have the dataset, it is downloaded from the web.
    """

    def __init__(
        self,
        train_batch_size: int,
        test_batch_size: int,
        dataset_path: str,
        workers: int,
        download: bool = True,
        test_sampler: t.Optional[Sampler] = None,
    ):
        """Construct a class for getting CIFAR-100 data loaders."""
        super(CIFAR100DataLoader, self).__init__(
            train_batch_size,
            test_batch_size,
            dataset_path,
            workers,
            download,
            test_sampler,
        )
        # Per-channel (R, G, B) normalization statistics for CIFAR-100.
        self.mean_val = (0.507075159237, 0.4865488733149, 0.440917843367)
        self.std_val = (0.267334285879, 0.2564384629170, 0.276150471325)

    def get_train_loader(self) -> DataLoader:
        """Get a PyTorch data loader for the training set with standard augmentation."""
        transform_train = transforms.Compose(
            [
                transforms.RandomCrop(32, padding=4),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize(self.mean_val, self.std_val),
            ]
        )

        dataset_train = datasets.CIFAR100(
            root=self.dataset_path,
            train=True,
            download=self.download,
            transform=transform_train,
        )

        train_loader = torch.utils.data.DataLoader(
            dataset_train,
            batch_size=self.train_batch_size,
            shuffle=True,
            num_workers=self.workers,
            pin_memory=True,
        )

        return train_loader

    def get_test_loader(self) -> DataLoader:
        """Get a PyTorch data loader for the test set (no augmentation)."""
        transform_test = transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize(self.mean_val, self.std_val)]
        )

        dataset_test = datasets.CIFAR100(
            root=self.dataset_path,
            train=False,
            download=self.download,
            transform=transform_test,
        )

        test_loader = torch.utils.data.DataLoader(
            dataset_test,
            batch_size=self.test_batch_size,
            shuffle=False,
            sampler=self.test_sampler,
            num_workers=self.workers,
            pin_memory=True,
        )

        return test_loader


class ImageNetDataLoader(QuantDataLoader):
    """
    Subclass of :class:`~quant.data.data_loaders.QuantDataLoader`, for ImageNet.

    The dataset must already be available and cannot be downloaded by this data loader.
    """

    def __init__(
        self,
        train_batch_size: int,
        test_batch_size: int,
        dataset_path: str,
        workers: int,
        download: bool = False,
        test_sampler: t.Optional[Sampler] = None,
        train_split: str = 'train',
        val_split: str = 'val',
    ):
        """
        Construct a class for getting ImageNet data loaders.

        Raises:
            ValueError: if `download` is True; ImageNet cannot be auto-downloaded.
        """
        super(ImageNetDataLoader, self).__init__(
            train_batch_size,
            test_batch_size,
            dataset_path,
            workers,
            download,
            test_sampler,
        )
        if download:
            raise ValueError(
                'ImageNet must be downloaded manually due to licensing restrictions.'
            )
        self.train_split = train_split
        self.val_split = val_split

        # Standard ImageNet normalization statistics.
        self.normalize = transforms.Normalize(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
        )

    def get_train_loader(self) -> DataLoader:
        """Get a PyTorch data loader for the training set with standard augmentation."""
        train_dir = Path(self.dataset_path) / self.train_split
        train_dataset = datasets.ImageFolder(
            train_dir,
            transforms.Compose(
                [
                    transforms.RandomResizedCrop(224),
                    transforms.RandomHorizontalFlip(),
                    transforms.ColorJitter(0.4, 0.4, 0.4),
                    transforms.ToTensor(),
                    self.normalize,
                ]
            ),
        )

        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self.train_batch_size,
            shuffle=True,
            num_workers=self.workers,
            pin_memory=True,
        )

        return train_loader

    def get_test_loader(self) -> DataLoader:
        """Get a PyTorch data loader for the test set (resize + center crop)."""
        test_dir = Path(self.dataset_path) / self.val_split
        test_dataset = datasets.ImageFolder(
            test_dir,
            transforms.Compose(
                [
                    transforms.Resize(256),
                    transforms.CenterCrop(224),
                    transforms.ToTensor(),
                    self.normalize,
                ]
            ),
        )

        test_loader = torch.utils.data.DataLoader(
            test_dataset,
            batch_size=self.test_batch_size,
            shuffle=False,
            num_workers=self.workers,
            pin_memory=True,
            sampler=self.test_sampler,
        )

        return test_loader
class QLeNet5(nn.Module):
    """LeNet-5 model with optional quantization of the middle conv layer."""

    def __init__(
        self,
        loss_fn: Callable[..., torch.Tensor],
        conv1_filters: int = 20,
        conv2_filters: int = 50,
        output_classes: int = 10,
        x_quant: str = 'fp',
        w_quant: str = 'fp',
        clamp: Optional[Dict] = None,
        moving_average_mode: str = 'off',
        moving_average_momentum: float = 0.99,
    ) -> None:
        """
        Initialize weights and biases for LeNet model.

        Args:
            loss_fn: loss function of the model
            conv1_filters: number of convolutional feature maps of the first conv layer
            conv2_filters: number of convolutional feature maps of the second conv layer
            output_classes: number of output classes
            x_quant: quantization scheme for activations,
                see :mod:`~quant.binary.binary_conv`.
            w_quant: quantization scheme for weights,
                see :mod:`~quant.binary.binary_conv`.
            clamp: clamping scheme for activations.
                It should have a key named "kind" indicating the kind of clamping function
                and other keys indicating other potential arguments.
                See :mod:`~quant.binary.binary_conv`.
            moving_average_mode: moving average mode to use
                see :class:`~quant.binary.activation_quantization.ActivationQuantizer`.
            moving_average_momentum: momentum for moving average
                update, see :class:`~quant.binary.activation_quantization.ActivationQuantizer`.
        """
        super(QLeNet5, self).__init__()
        # loss_fn is a loss function in torch.nn.functional
        # (setattr avoids registering the callable as a submodule/parameter).
        setattr(self, 'loss_fn', loss_fn)

        self.conv1_filters = conv1_filters
        self.conv2_filters = conv2_filters
        self.output_classes = output_classes
        self.x_quant = x_quant
        self.w_quant = w_quant

        # First conv layer is always full precision.
        self.conv1 = nn.Conv2d(1, conv1_filters, 5, stride=1)
        self.bn_conv1 = nn.BatchNorm2d(conv1_filters, eps=1e-4, momentum=0.1, affine=False)
        # Only the second conv layer is (optionally) quantized.
        self.conv2 = QuantConv2d(
            x_quant, w_quant, conv1_filters, conv2_filters, 5,
            clamp, moving_average_mode, moving_average_momentum, stride=1
        )

        # bn_conv2 normalizes conv2's *input*, which has conv1_filters channels
        # (see forward: conv2(bn_conv2(x))) — the channel count is intentional.
        self.bn_conv2 = nn.BatchNorm2d(conv1_filters, eps=1e-4, momentum=0.1, affine=False)
        self.fc1 = nn.Linear(conv2_filters * 4 * 4, conv2_filters * output_classes)
        self.fc2 = nn.Linear(conv2_filters * output_classes, output_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # type: ignore
        """Forward pass of LeNet5 model; returns log-probabilities over classes."""
        # first layer full precision
        x = self.conv1(x)
        x = self.bn_conv1(F.relu(x, inplace=True))
        x = F.max_pool2d(x, kernel_size=2, stride=2)

        # Quantized conv layer, preceded by batch norm on its input.
        x = F.relu(self.conv2(self.bn_conv2(x)), inplace=True)
        x = F.max_pool2d(x, kernel_size=2, stride=2)

        # Flatten to (batch, conv2_filters * 4 * 4) for the fully-connected layers.
        x = x.view(-1, self.conv2_filters * 4 * 4)
        x = self.fc1(x)
        x = F.relu(x, inplace=True)

        # last layer full precision
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)
# Maps config strings to the non-linearity module classes blocks may use.
non_linearity_map = {
    'relu': nn.ReLU,
    'prelu': nn.PReLU,
    'identity': nn.Identity,
}


class RegularBasicBlock(nn.Module):
    """ResNet regular basic block: (Conv -> BN -> NonLin) x 2 with an additive shortcut."""

    def __init__(
        self, in_planes: int, planes: int, x_quant: str, w_quant: str,
        nonlins: List[str], stride: int = 1,
        clamp: Optional[Dict] = None,
        moving_average_mode: str = 'off',
        moving_average_momentum: float = 0.99,
    ) -> None:
        """
        Build ResNet regular basic block.

        Args:
            in_planes: the number of in-channels for the block
            planes: the number of out-channels for the block
            x_quant: quantization scheme for activations,
                see :mod:`~quant.binary.binary_conv`.
            w_quant: quantization scheme for weights,
                see :mod:`~quant.binary.binary_conv`.
            nonlins: non-linearities for the block. It should be a list of two
                strings, where each string is in {'relu', 'prelu', 'identity'}.
            stride: stride size
            clamp: clamping scheme for activations.
                It should have a key named "kind" indicating the kind of clamping function
                and other keys indicating other potential arguments.
                See :mod:`~quant.binary.binary_conv`.
            moving_average_mode: moving average mode to use,
                see :class:`~quant.binary.activation_quantization.ActivationQuantizer`.
            moving_average_momentum: momentum for moving average update,
                see :class:`~quant.binary.activation_quantization.ActivationQuantizer`.

        Raises:
            ValueError: if `nonlins` does not contain exactly two entries.
        """
        super(RegularBasicBlock, self).__init__()
        if len(nonlins) != 2:
            raise ValueError('There should be 2 non-linearities.')

        self.conv1 = QuantConv2d(
            x_quant, w_quant, in_planes, planes, 3, clamp,
            moving_average_mode, moving_average_momentum, stride=stride, padding=1, bias=False
        )
        self.bn1 = nn.BatchNorm2d(planes)
        self.nonlin1 = non_linearity_map[nonlins[0]]()

        self.conv2 = QuantConv2d(
            x_quant, w_quant, planes, planes, 3, clamp,
            moving_average_mode, moving_average_momentum, stride=1, padding=1, bias=False
        )
        self.bn2 = nn.BatchNorm2d(planes)
        self.nonlin2 = non_linearity_map[nonlins[1]]()

        # Identity shortcut when shapes match; otherwise a full-precision
        # 1x1 conv + BN projection to align channels/stride.
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(
                    in_planes,
                    planes,
                    kernel_size=1,
                    stride=stride,
                    bias=False,
                ),
                nn.BatchNorm2d(planes),
            )

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # type: ignore
        """Forward pass of RegularBasicBlock."""
        out = self.nonlin1(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        # Residual connection added before the final non-linearity.
        out = out + self.shortcut(x)
        out = self.nonlin2(out)
        return out
+ """ + + def __init__( + self, in_planes: int, planes: int, x_quant: str, w_quant: str, + nonlins: List[str], stride: int = 1, double_shortcut: bool = False, + clamp: Optional[Dict] = None, + moving_average_mode: str = 'off', + moving_average_momentum: float = 0.99, + ) -> None: + """ + Build ResNet XNOR basic block. + + Args: + in_planes: the number of in-channels for the block + planes: the number of out-channels for the block + x_quant: quantization scheme for activations, + see :mod:`~quant.binary.binary_conv`. + w_quant: quantization scheme for weights, + see :mod:`~quant.binary.binary_conv`. + nonlins: non-linearities for the block. It should be a list of two + strings, where each string is in {'relu', 'prelu', 'identity'}. + stride: stride size + double_shortcut: whether to use double shortcuts. + clamp: clamping scheme for activations. + It should have a key named "kind" indicating the kind of clamping function + and other keys indicating other potential arguments. + See :mod:`~quant.binary.binary_conv`. + moving_average_mode: moving average mode to use, + see :class:`~quant.binary.activation_quantization.ActivationQuantizer`. + moving_average_momentum: momentum for moving average update, + see :class:`~quant.binary.activation_quantization.ActivationQuantizer`. 
+ """ + super(XnorBasicBlock, self).__init__() + if len(nonlins) != 2: + raise ValueError('There should be 2 non-linearities.') + self.double_shortcut = double_shortcut + + self.bn1 = nn.BatchNorm2d(in_planes) + self.conv1 = QuantConv2d( + x_quant, w_quant, in_planes, planes, 3, clamp, + moving_average_mode, moving_average_momentum, stride=stride, padding=1, bias=True + ) + self.nonlin1 = non_linearity_map[nonlins[0]]() + + self.bn2 = nn.BatchNorm2d(planes) + self.conv2 = QuantConv2d( + x_quant, w_quant, planes, planes, 3, clamp, + moving_average_mode, moving_average_momentum, stride=1, padding=1, bias=True + ) + self.nonlin2 = non_linearity_map[nonlins[1]]() + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != planes: + self.shortcut = nn.Sequential( + nn.Conv2d( + in_planes, + planes, + kernel_size=1, + stride=stride, + bias=True, + ), + nn.BatchNorm2d(planes), + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: # type: ignore + """Forward pass of XnorBasicBlock.""" + out1 = self.nonlin1(self.conv1(self.bn1(x))) + if self.double_shortcut: + out1 = out1 + self.shortcut(x) + out2 = self.conv2(self.bn2(out1)) + if self.double_shortcut: + out2 = self.nonlin2(out2) + return out2 + out1 + out2 = out2 + self.shortcut(x) + return self.nonlin2(out2) + + +class QResNet(nn.Module): + """ + ResNet implementation supporting full precision and quantized schemes. + + Note we use full-precision down-sampling. See: + + Zechun Liu, Baoyuan Wu, Wenhan Luo, Xin Yang, Wei Liu, and Kwang-Ting Cheng. + Bi-real net: Enhancing the performance of 1-bit CNNs with improved representational + capability and advanced training algorithm. + In Proceedings of the European conference on computer vision (ECCV), pages 722–737, 2018. + + Two types of blocks can be used, either + :class:`~quant.models.resnet.RegularBasicBlock` (regular) or + :class:`~quant.models.resnet.XnorBasicBlock` (xnor). 
class QResNet(nn.Module):
    """
    ResNet implementation supporting full precision and quantized schemes.

    Note we use full-precision down-sampling. See:

    Zechun Liu, Baoyuan Wu, Wenhan Luo, Xin Yang, Wei Liu, and Kwang-Ting Cheng.
    Bi-real net: Enhancing the performance of 1-bit CNNs with improved representational
    capability and advanced training algorithm.
    In Proceedings of the European conference on computer vision (ECCV), pages 722–737, 2018.

    Two types of blocks can be used, either
    :class:`~quant.models.resnet.RegularBasicBlock` (regular) or
    :class:`~quant.models.resnet.XnorBasicBlock` (xnor).

    ResNet consists of the following layers:
    layer0 (first layer), layer1, layer2, layer3, layer4 (optional), layer5 (last layer).

    `layer0` is the feature extractor layer (conv1).
    Its config dictionary contains keys: `n_in_channels`, `kernel_size`, `stride`, `padding`,
    `bias`, and `maxpool`.
    It is important to note that `n_in_channels` does not refer to the number of channels of the
    image (3), but rather the number of input channels to `layer1`.
    All arguments except for `maxpool` are passed to PyTorch ``nn.Conv2d``.
    `maxpool` is another dictionary with keys `type`, `kernel_size`, `stride`, and `padding`.
    If the type is `identity`, there is no pooling.
    If the type is `maxpool2d`, then the other keys are passed to construct ``nn.MaxPool2d``.

    `layer1`, `layer2`, `layer3`, `layer4` are all dictionaries used to
    configure the corresponding layers.
    Usually they can all be the same dictionary.
    The keys and values here are used to construct either
    :class:`~quant.models.resnet.RegularBasicBlock` or :class:`~quant.models.resnet.XnorBasicBlock`
    depending on what is specified in `block`.

    `nonlins` is a list of two strings specifying the non-linearity to use inside each layer.
    Each string value can be `relu`, `prelu`, or `identity`.
    """

    def __init__(
        self,
        loss_fn: Callable[..., torch.Tensor],
        block: str,
        layer0: dict,
        layer1: dict,
        layer2: dict,
        layer3: dict,
        layer4: Optional[dict],
        nonlins: List[str],
        num_blocks: List[int],
        output_classes: int,
        moving_average_mode: str = 'off',
        moving_average_momentum: float = 0.99,
    ) -> None:
        """
        Construct QResNet.

        Args:
            loss_fn: loss function of the model
            block: name of the block to use ('regular' or 'xnor')
            layer0: configuration for conv1 layer of the model
            layer1: configuration for layer1 layer of the model
            layer2: configuration for layer2 layer of the model
            layer3: configuration for layer3 layer of the model
            layer4: configuration for layer4 layer of the model
            nonlins: non-linearities to use for each layer. It should be a list of two
                strings, where each string is in {'relu', 'prelu', 'identity'}.
            num_blocks: a list representing the number of blocks in each layer
            output_classes: number of output classes
            moving_average_mode: moving average mode to use
                see :class:`~quant.binary.activation_quantization.ActivationQuantizer`.
            moving_average_momentum: momentum for moving average
                update, see :class:`~quant.binary.activation_quantization.ActivationQuantizer`.

        Raises:
            ValueError: if `block` is not a supported block name, or the
                `layer0` maxpool type is unsupported.
        """
        super(QResNet, self).__init__()
        # loss_fn is a loss function in torch.nn.functional
        # (setattr avoids registering the callable as a submodule/parameter).
        setattr(self, 'loss_fn', loss_fn)

        blocks = {
            'regular': RegularBasicBlock,
            'xnor': XnorBasicBlock,
        }
        try:
            # NOTE(review): annotation reads as an instance type but the value
            # is the block *class* — mypy is silenced with the ignore below.
            block_cls: Union[RegularBasicBlock, XnorBasicBlock] \
                = blocks[block]  # type: ignore
        except KeyError:
            raise ValueError(f'Block {block} is not supported.')

        n_in_channels = layer0['n_in_channels']

        # Stem: always a full-precision conv over the 3-channel input image.
        self.conv1 = nn.Conv2d(
            3,
            n_in_channels,
            kernel_size=layer0['kernel_size'],
            stride=layer0['stride'],
            padding=layer0['padding'],
            bias=layer0['bias'],
        )
        if layer0['maxpool']['type'] == 'identity':
            self.maxpool = nn.Identity()
        elif layer0['maxpool']['type'] == 'maxpool2d':  # pragma: no cover (coverage does not report it even though it's covered) # noqa: E501
            self.maxpool = nn.MaxPool2d(  # type: ignore
                kernel_size=layer0['maxpool']['kernel_size'],
                stride=layer0['maxpool']['stride'],
                padding=layer0['maxpool']['padding'],
            )
        else:
            raise ValueError(
                f"maxpool type {layer0['maxpool']['type']} is not supported."
            )

        self.bn1 = nn.BatchNorm2d(n_in_channels)

        # All stages are appended to this flat ModuleList; the stem modules are
        # also attributes above, so they appear twice in the module tree but
        # share the same parameters.
        self.blocks = nn.ModuleList(
            [nn.Sequential(self.conv1, self.bn1, nn.ReLU(inplace=True), self.maxpool)]
        )

        # Channel width doubles at each down-sampling stage (stride=2).
        n_planes = self._make_layer(
            block_cls, layer1,
            n_in_channels, n_in_channels, num_blocks[0], nonlins, stride=1,
            moving_average_mode=moving_average_mode,
            moving_average_momentum=moving_average_momentum
        )
        n_planes = self._make_layer(
            block_cls, layer2,
            n_planes, 2 * n_in_channels, num_blocks[1], nonlins, stride=2,
            moving_average_mode=moving_average_mode,
            moving_average_momentum=moving_average_momentum
        )
        n_planes = self._make_layer(
            block_cls, layer3,
            n_planes, 4 * n_in_channels, num_blocks[2], nonlins, stride=2,
            moving_average_mode=moving_average_mode,
            moving_average_momentum=moving_average_momentum
        )
        if layer4 is not None:  # pragma: no cover (coverage does not report it even though it's covered) # noqa: E501
            n_planes = self._make_layer(
                block_cls, layer4,
                n_planes, 8 * n_in_channels, num_blocks[3], nonlins, stride=2,
                moving_average_mode=moving_average_mode,
                moving_average_momentum=moving_average_momentum
            )

        # Classification head: global average pool -> flatten -> linear.
        self.linear_classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),  # type: ignore
            nn.Linear(n_planes, output_classes)
        )

    def _make_layer(
        self,
        block: Union[RegularBasicBlock, XnorBasicBlock],
        layer_config: dict,
        in_planes: int,
        out_planes: int,
        num_blocks: int,
        nonlins: List[str],
        stride: int,
        moving_average_mode: str = 'off',
        moving_average_momentum: float = 0.99,
    ) -> int:
        """
        Make a layer (layer1, layer2, layer3, layer4).

        Args:
            block:
                block to use in the layer
            layer_config: a dictionary containing the config for the layer.
                It should have the following keys:
                * x_quant: quantization scheme for activations
                * w_quant: quantization scheme for weights
                * clamp: clamping scheme for activations.
                    It should have a key named "kind" indicating the kind of clamping function
                    and other keys indicating other potential arguments.
                * other optional keys such as double_shortcut
            in_planes: the number of in-channels for the layer
            out_planes: the number of out-channels for the layer
            num_blocks: the number of blocks for the layer
            nonlins: non-linearities for the current layer. It should be a list of two
                strings, where each string is in {'relu', 'prelu', 'identity'}.
            stride: stride size
            moving_average_mode: moving average mode to use
                see :class:`~quant.binary.activation_quantization.ActivationQuantizer`.
            moving_average_momentum: momentum for moving average
                update, see :class:`~quant.binary.activation_quantization.ActivationQuantizer`.

        Returns:
            the number of planes of the layer
        """
        # Only the first block of the stage down-samples; the rest use stride 1.
        strides = [stride] + [1] * (num_blocks - 1)
        for stride in strides:
            self.blocks.append(
                block(in_planes, out_planes, nonlins=nonlins, stride=stride,
                      moving_average_mode=moving_average_mode,
                      moving_average_momentum=moving_average_momentum, **layer_config)
            )
            # After the first block the stage's input width equals its output width.
            in_planes = out_planes

        return in_planes

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # type: ignore
        """Forward pass of QResNet."""
        for block in self.blocks:
            x = block(x)
        return self.linear_classifier(x)
def log_checkpoints(
    checkpoint_dir: Path,
    model: Union[nn.Module, nn.DataParallel],
    optimizer: Optimizer,
    scheduler: optim.lr_scheduler._LRScheduler,
    epoch: int,
) -> None:
    """
    Serialize a PyTorch model in the `checkpoint_dir`.

    Writes `checkpoint_{epoch}.pt` containing the model, optimizer, and
    scheduler state dicts along with the epoch number.

    Args:
        checkpoint_dir: the directory to store checkpoints
        model: the model to serialize
        optimizer: the optimizer to be saved
        scheduler: the LR scheduler to be saved
        epoch: the epoch number
    """
    checkpoint_file = 'checkpoint_{}.pt'.format(epoch)
    checkpoint_dir.mkdir(exist_ok=True, parents=True)
    file_path = checkpoint_dir / checkpoint_file

    # Always save the wrapped module's weights so checkpoints load the same
    # way whether or not training used nn.DataParallel.
    if isinstance(model, nn.DataParallel):
        model_state_dict = model.module.state_dict()
    else:
        model_state_dict = model.state_dict()

    torch.save(  # type: ignore
        {
            'epoch': epoch,
            'model_state_dict': model_state_dict,
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
        },
        file_path,
    )


def restore_from_checkpoint(
    model: Union[nn.Module, nn.DataParallel],
    optimizer: Optional[Optimizer],
    scheduler: Optional[optim.lr_scheduler._LRScheduler],
    checkpoint_path: str,
    device: torch.device,
    strict_keys: bool = True,
) -> Tuple[
    Union[nn.Module, nn.DataParallel],
    Optional[Optimizer],
    Optional[optim.lr_scheduler._LRScheduler],
    int,
]:
    """
    Restore model, optimizer, and learning rate scheduler state from checkpoint.

    Args:
        model: the model object to be restored
        optimizer: the optimizer to be restored, or None to skip optimizer state
        scheduler: the LR scheduler to be restored, or None to skip scheduler state
        checkpoint_path: path to a model checkpoint
        device: the device to load data to. Note that
            the model could be saved from a different device.
            Here we transfer the parameters to the current given device.
            So, a model could be trained and saved on GPU, and be loaded on CPU, for example.
        strict_keys: If True keys in state_dict should be identical after restoring

    Returns:
        the initialized model, optimizer, scheduler, and epoch from the checkpoint
    """
    checkpoint = torch.load(checkpoint_path, map_location=device)  # type: ignore
    if isinstance(model, nn.DataParallel):
        model.module.load_state_dict(checkpoint['model_state_dict'], strict=strict_keys)
    else:
        model.load_state_dict(checkpoint['model_state_dict'], strict=strict_keys)

    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

        # Transfer optimizer-internal tensors (e.g. momentum buffers) to the
        # target device. Fix: this loop is now guarded by the None check above;
        # previously it ran unconditionally and raised AttributeError when
        # restoring with optimizer=None (weights-only restore).
        for state in optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(device)

    if scheduler is not None:
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

    loaded_epoch = checkpoint['epoch']

    return model, optimizer, scheduler, loaded_epoch
+ + Args: + experiment_path: path to an experiment directory + epoch: If given tries to load that checkpoint, otherwise + loads the last checkpoint + + Returns: + Path to checkpoint file + """ + ckpts_path = experiment_path / 'checkpoints' + ckpts_dict = { + int(path.name.split('_')[1].split('.')[0]): path + for path in ckpts_path.iterdir() + } + if len(ckpts_dict) == 0: + raise ValueError( + f'No checkpoint exists in the experiment directory: {experiment_path}' + ) + if epoch is not None: + if epoch not in ckpts_dict.keys(): + raise ValueError(f'Could not find checkpoint for epoch {epoch}.') + else: + epoch = max(ckpts_dict.keys()) + + return str(ckpts_dict[epoch]) diff --git a/quant/utils/kd_criterion.py b/quant/utils/kd_criterion.py new file mode 100644 index 0000000..fa0a5ec --- /dev/null +++ b/quant/utils/kd_criterion.py @@ -0,0 +1,52 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# + +"""Criterion for knowledge distillation.""" + +import torch +import torch.nn.functional as F + + +def kd_criterion( + output_student: torch.Tensor, + output_teacher: torch.Tensor, + target: torch.Tensor, + temperature: float, + freeze_teacher: bool = True, + teacher_correction: bool = True, +) -> torch.Tensor: + """ + Criterion for knowledge distillation. 
+ + Args: + output_student: student network output + output_teacher: teacher network output + target: target tensor + temperature: temperature + freeze_teacher: whether to freeze teacher + teacher_correction: whether to use the regular loss when the teacher's prediction + is different from the true label for that particular example + + Returns: + loss based on knowledge distillation criterion + """ + output_teacher_val = output_teacher.detach() if freeze_teacher else output_teacher + + kd_loss = F.kl_div( + F.log_softmax(output_student / temperature, dim=1), + F.softmax(output_teacher_val / temperature, dim=1), + reduction='none' + ) * (temperature * temperature) + kd_loss = kd_loss.sum(dim=1) + + if teacher_correction: + pred_teacher = output_teacher_val.argmax(dim=1) + correct_mask = pred_teacher.eq(pred_teacher) + ce_loss = F.cross_entropy(output_student, target, reduction='none') + total_loss = correct_mask * kd_loss + ~correct_mask * ce_loss + else: + total_loss = kd_loss + + return total_loss.mean() diff --git a/quant/utils/linear_lr_scheduler.py b/quant/utils/linear_lr_scheduler.py new file mode 100644 index 0000000..7de36df --- /dev/null +++ b/quant/utils/linear_lr_scheduler.py @@ -0,0 +1,54 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# + +"""Linear PyTorch learning rate scheduler.""" + +from typing import List + +from torch.optim import Optimizer # type: ignore +from torch.optim.lr_scheduler import _LRScheduler + + +class LinearLR(_LRScheduler): + """Decays the learning rate following a linear schedule.""" + + def __init__(self, optimizer: Optimizer, min_lr: float, total_epochs: int, + steps_per_epoch: int, last_epoch: int = -1) -> None: + """ + Construct a linear lr scheduler specifying the minimum lr and last epoch. + + Args: + optimizer: Wrapped optimizer. + min_lr: Minimum learning rate. + total_epochs: Total number of epochs. + steps_per_epoch: The number of steps (batches) per epoch. 
+ last_epoch: The index of the last batch. + This parameter is used when resuming a training job. + Since step() should be invoked after each batch instead of after each epoch, + this number represents the total number of batches computed, + not the total number of epochs computed. + When last_epoch=-1, the schedule is started from the beginning. Default: -1 + The batch size should be consistent between the resuming job and the prior job, + or else the scheduler can be wrong. + + """ + self.min_lr = min_lr + self.total_epochs = total_epochs + self.steps_per_epoch = steps_per_epoch + super(LinearLR, self).__init__(optimizer, last_epoch) + + def get_lr(self) -> List[float]: # type: ignore + """Get the current learning rate for each parameter group.""" + lrs = [] + total_steps = (self.total_epochs - 1) * self.steps_per_epoch + last_epoch = self.last_epoch # type: ignore + for group in self.optimizer.param_groups: # type: ignore + lr_0 = group['initial_lr'] + lr = max( + lr_0 - last_epoch / total_steps * (lr_0 + self.min_lr), self.min_lr + ) + lrs.append(lr) + + return lrs diff --git a/quant/utils/moving_average.py b/quant/utils/moving_average.py new file mode 100644 index 0000000..9b696e8 --- /dev/null +++ b/quant/utils/moving_average.py @@ -0,0 +1,39 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# + +"""Exponential moving average layer.""" + +import torch +import torch.nn as nn + + +class MovingAverage(nn.Module): + """Exponential moving average.""" + + def __init__(self, momentum: torch.Tensor) -> None: + """ + Construct moving average layer. + + Args: + momentum: A vector indicating the momentum to use for the corresponding row. 
+ """ + super(MovingAverage, self).__init__() + self.register_buffer('num_batches_tracked', torch.tensor(0)) + self.register_buffer('momentum', momentum) + self.register_buffer('moving_average', torch.tensor([0.0] * len(momentum))) + + def forward(self, x: torch.Tensor) -> torch.Tensor: # type: ignore + """Return the current moving average, given a vector x.""" + if self.training: + with torch.no_grad(): + if self.num_batches_tracked.item() > 0: # type: ignore + old = self.momentum * self.moving_average # type: ignore + new = (torch.ones_like(self.momentum) - self.momentum) * x # type: ignore + self.moving_average.copy_(old + new) # type: ignore + else: + self.moving_average.copy_(x) # type: ignore + self.num_batches_tracked += 1 # type: ignore + + return self.moving_average # type: ignore diff --git a/quant/utils/utils.py b/quant/utils/utils.py new file mode 100644 index 0000000..6a9b614 --- /dev/null +++ b/quant/utils/utils.py @@ -0,0 +1,13 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# + +"""Common utility functions.""" + +from typing import Any + + +def noop(*args: Any, **kwargs: Any) -> None: + """No-op that returns None.""" + return None diff --git a/quant/utils/visualization.py b/quant/utils/visualization.py new file mode 100644 index 0000000..4d0e2e2 --- /dev/null +++ b/quant/utils/visualization.py @@ -0,0 +1,116 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# + +""" +Utilities for supporting visualization with TensorBoard. + +Quant supports visualizing loss and evaluation metrics during training in TensorBoard. 
+""" + +from typing import List, Tuple +from functools import partial +from pathlib import Path +import shutil +from typing import Any, Dict, Optional + +from torch.utils.tensorboard import SummaryWriter + +from quant.common.metrics import Metric + + +class Visualizer: + """TensorBoard visualizer.""" + + def __init__( + self, + tensorboard_base_dir: Path, + root_experiments_dir: Path, + experiment_name: str, + ) -> None: + """ + Create a visualizer object for TensorBoard. + + Args: + tensorboard_base_dir: Root directory where TensorBoard experiments are stored + root_experiments_dir: Root directory for storing logs, checkpoints, etc. + experiment_name: Name of the experiment + """ + self.tensorboard_base_dir = tensorboard_base_dir + self.root_experiments_dir = root_experiments_dir + self.experiment_name = experiment_name + self.writer = SummaryWriter(str(tensorboard_base_dir / experiment_name)) # type: ignore + + def hook(self, split: str, metrics: Dict[str, Metric], + epoch: int, global_step: int, log_interval: int = 10, + values_dict: Optional[Dict[str, float]] = None, **kwargs: Any) -> None: + """ + Provide a training / test loop-compatible hook for logging evaluation metrics. + + Args: + split: The split to visualize, e.g. 
train or test + metrics: Dictionary mapping metric names to Metric objects + epoch: Training epoch + global_step: Unique incrementing integer across all epochs indicating the step + log_interval: frequency for logging metrics + values_dict: Dictionary mapping names to values + for other non-metric values to log + """ + if values_dict is None: + values_dict = {} + + if split != 'train': + for name, metric in metrics.items(): + name = name.replace(' ', '_') + self.writer.add_scalar(f'{name}/{split}', metric.compute(), epoch) + + for name, val in values_dict.items(): + name = name.replace(' ', '_') + self.writer.add_scalar(f'{name}/{split}', val, epoch) + + elif global_step % log_interval == 0: + for name, metric in metrics.items(): + name = name.replace(' ', '_') + self.writer.add_scalar(f'{name}/{split}', metric.compute(), global_step) + + for name, val in values_dict.items(): + name = name.replace(' ', '_') + self.writer.add_scalar(f'{name}/{split}', val, global_step) + + def __del__(self) -> None: + """Make a copy of the summary writer logs in the experiment artifacts.""" + shutil.copytree( + self.tensorboard_base_dir / self.experiment_name, + self.root_experiments_dir / self.experiment_name / 'tensorboard', + ) + + +def get_tensorboard_hooks( + config: dict, experiment_root_directory: Path, + train_metrics: Dict[str, Metric], test_metrics: Dict[str, Metric] +) -> Tuple[List, List]: + """ + Get TensorBoard hooks for visualizing metrics as training progresses. + + Args: + config: experiment config + experiment_root_directory: root directory for storing logs, checkpoints, etc. 
+ train_metrics: dict mapping metric keys to metric objects for training + test_metrics: dict mapping metric keys to metric objects for testing + """ + log_config = config['log'] + + train_hooks = [] + test_hooks = [] + + if log_config['tensorboard']: + visualizer = Visualizer( + Path(log_config['tensorboard_root']), + Path(experiment_root_directory), + config['experiment_name'], + ) + train_hooks.append(partial(visualizer.hook, split='train', metrics=train_metrics)) + test_hooks.append(partial(visualizer.hook, split='test', metrics=test_metrics)) + + return train_hooks, test_hooks diff --git a/quant_logo.png b/quant_logo.png new file mode 100644 index 0000000000000000000000000000000000000000..78a7aea6b318d0720e8d85cad18c8390ee0a34c1 GIT binary patch literal 11865 zcmeHtc{tSF`~OtW^Q2Vji4>vuB4Us=vZO^>B5U?acE&bl%HBdDBx`onpoHv=v6MI_{sJxBt1|ZBXP4(Ca~}jkcNG3@^KG3=Kp>7GE}uVZd^>HL z%JJ65KE3DXxYC=@$U}$Z+D$LfTS#I3%72T}whR52rN!PJm!}EEYvJJu#SLg-Nu;c@ z@-BNqYu1}@iJyy`{Yv*+vFrV)DETfY98UPd{FI2S!tt)-isAI=8<+2gx?(B4?Wdbo zzY2@aX09%&jxPnN$sYO0sKw8oRhLTW{U#N?|ZHI$Zx?bR1S|EITg2Wo^I?%WL$0bM^%laq(Isx=0kdGYzC zQu>+HcXe|678$*|90?Q4mfnGi;6En&yi za4SRz|KY<*v4{3$DiL>b;O_4{l%DdZ8fL_UTw9}_-a2P(2<46JFxa?W>tj5z><9xH z_J>O~s6)5i+`N=;e=kAP-k+X$xN4gcJ=)6sGM;`S$lkmHFGC=dE+9jA{ssZVW!B6@ z*3S2qEq`tAUcvhx%B=~mIL4>^B)1?YNS#k@t=<@Q-S=mHaXdb`mulm;QzPmT0G1;5 ze5p^^pIokESQY55y_4h3CH32=W2!Z)J;sR&ib`p}yN6U?ci)aIx|#N6r>b)BTg2vspsk~OdHupjxqwx)Wvf)kPDDnLps9v}6$H`2+nDuU3 zJ=bn($Bck0(l#S{^D9IT4uw|DOzoG{7^LJ5{ueV!k6$JqYmkOHkBN0?K0spS(s z_*A{ug*tREGt8+sca9B*on}p{&oHi~)WRaW+cJw{+7k zzd2fhsvS9_BDJGk5AZEr5TW-fXuS5^L@NPBA0yh3*r+(18A9YN>=Z%)nwiW-Nov4^ zAUYF7xZVU;$S$b5IoUO7`dOn-)!_cB_r_p>ARLT;rqR2Dg3nfMFsG_k4rjCLhC|d1 zEqnoL)YVQi0`-YDZF?MC`N>?L!;CWt;}^2on@<3m-@kDVqxbnL`Z8iK(2!D5+sjx? 
zm|UR2;PUnVM0(_dz8en7TR89a$$&eXaz@LHw%r=W)=#IdRZ6 z<0$j>7buK%?ep6gE9G`eN3$A5JgZ4NOC>1AJbyYtdE@#&G?(@{(zX|X#e@)l!A8|z zb4Uye8^cz;)WcZ))e7^Lftb(EOnuwc^*_^*uX-N25Tf!zWTDaGxop&;n11p^$yo2B z-AXLIPYbO}L<2YImMYd3#Roc>30F$YGBZ^_H*!YL!arSZ8NW z+DYhIi`#=y`nly(s7dszM9FiRFbX;vW-}lF0rSbgDU#O5+#dP6}hoh(~IVb zdjTL4#93bEdD}LZnke;l7uSLSTrTdu#guuMRD-j@&@&yT#cnIn`nJ@G`Ia$F0ajDr zDk?y^;2Qo7RZ^k}1e7x;+hwTMKVJ=FlWQrR${q+4fr0;L5o7FtuMdHe?5M!aybr)0 zd%&^Qpszv=>$RfofUx$DHJ?tbo%?J_p+@}MCOzOk%t(*sJeA7+0eYzZL^LOJS(L&r zc1wZ#9^~f1O)y_7oR6~zJf_mI z>C_+4fowB4=1LRxep$ zpHn4mp4dMlg&t!;<>ED9(dTYavga&}S!VsyDJc{-RRHaP`~){;8vAVZ{Iy(48S&H~ zThBgYN2khe_4?rVhk#kLJkZxlB)pHqY^pIxdwUDtW8eGKKJPT^`R%PEKOjvD36-|r zFij=TthPFA?~Etd&7hWcYwbE)1CcYo)(-_bJ2?drEEQ7O*%2%c;SKDdziLZ8m(LR{e_8$8-OLwI+wOyp)E3f^WSK(tFZ9|vCQq?6v+IgpD$=Yj9vFqNupV@3x0ECq7Xo`TlF>%KE6 zsvEN73Cq~^8Xcqyc1n=jAm*oI65>ts1 z`H%F>TdV&FNAM+5no61hS4SXMwwC37cL)@$DYD}+P!-a3CW&1hI-2Ghec;|T2oZ6E zWtYXnQtzb1iy_)9X3O_Ej&5fOBG}FW6dJAy8TV3HxvV_&<7yvuqngZS;0!E6gh>D2 zSrGLwVIdyC+_*Imu|EvxMI74$&>%Dp{~iFs@;6JYnEIa|8sjhHk*>b7G3&Xa?HTGd zFX7R@DQ?hx1;05;Q8Cii(HYi|;zOxlM%uQ&%>FYqz8}Md)}qXTfza8InOMNna;Ssu4~5kgv4!VKIH{yx0@KH~oWbFRO7 zp5A1t$QV~YeHEP85FxMq&Si|RT3Az}`3O38;GksC+b9_Yel_&q`xc_!=^~w2fTGc8 zqc|DxocG*eq#^*h)LHDaB|ZvYa60IPr z9?Jh`cr>p*=+Ll6r!@sSF@BK`18L)LuNqqz)ib*Eo}LR^aw$Kx*!Y}%-L)=`CpG|) ztgo>Y?}wu=UPK=N$CxZ;gBHEj)XaanaWJ0M?NXJ}q`FaRyY&Qx8Dc6Dz(Rwab&vb6 zs7MI#)l_dmn-7yMv^UFlRk{6MMiXkMJOkni-D5`Z!MZz^;4!PeWt_0%MHJEQ0cN@< zJzAiyOl}{LF!@;p!75#(T>^8U=bmnvaLSw4qIHa4`AHHBkGKdYK7$CT$@&WZ`|N{n zWj(ysj-^cCt;R=z5e6o>M)w{Y9_OC&VcgavXY(7~@we_!x;KdZacC1JX%%p~`G9 z?MEU1e;TOg>6e^e4$Td5lb0~o);v3b-)(4z@pm$2b;bC^YDjZmg3;iJ4o26IT69r2 zjGSL)_k>J-zk31a1sapxsenykVyHUR7v&6)A{;EuVkt_pSf7c|uvPlHgL2RsnXCjl znsZQ&-bMg+s5-OF<75=T`QB|(8E`QO?*2{uva^$0_;`I#@CCj`HuhP6eFM2sX)thY zp2cHx;;sv*+~Ehc8~*;yV}NRs?7{RVy%`4|4uPN#Zg7;U>=2+P;3d_K?il16nVtw# z(88OVM{+Qw_@*n6y@ke?f|c!VsN(kdZ&!&>AlW(&Oq1^5Bf!66$%@nD%W={D4Q})e 
zGW%W*_koOGYD@Gj3*uijj~wbsy4bMN(7jO~3j?h=MeG7?L9RLfyv<(63FbsXO{gFVH8UU&wA8NuD}G7^Z7B1KT+On3K`^}_8fmH_eF>o?VjW49Z8}-Gpr&2ET32K z9BQ5>&PPF)$jt^J#q@pDsPv4s@&jC;#Yp~SEX6>=Qq7QuZ+zI5|3DRJ)a`q7f&OmX z#}aiuVfq~ixc8jRW4fpVRjY>DQ#w+Qji%qNg$H|GqJapJzER49pFq>1(S47=EjVa6 zQp%5?D38ihNU}Mcd!(wB1Q)ZVYIr|d8QqQU^TM-H2RGI%Lt!*oPneG=aPnTAk&22g zb)Pz@32%l97v#KoRgh9MBIsFW$4bUGI6;9;+ekDAwL*&>6tgG}#$a$+C!wjA}pv_g9$@*XA_q2*d!a!mQ`mW;@u7|(JNAG6KX4`CdsZ~*g}!^1yqU#?T}jS zSnR_09cBqI9%%Xrk^Ku6G~{I2l=o1rf8DcMChjDk49q6-EYiNAqPfM-cy7MPZ6%i< z2jpo4=ev#M>`Hr)%HufE_XLzlfkI_(BOx^7j8r1h16gxBY%I(;X6=JKt{W(RR5Tw; zi637{i6>KW+^s({_F9r1XRV&V47nx<1a5Q!xDzC5vLVw+QALWInAE+-$W(Fw;hh!J zArAZaCVoB2zPaM<)%=8-1Yk4I%|Nq1k*;eyn@xN!J%O>^AgQn0D&T-C*a{P4Q*9_u zUMGKB*9M(VC~cGGNk3CS^06F{y^J!(!D;bim3eI%4=Bl$R_Uvub~*qLIJpQwpQ7}| zBquQ|-ZJ@wAnF8PG6(-PV?RF7@3;E^%jXJ-Qdr++1DEOd)${8V@_h~~`(3ht7{m?1 z%2!^^ByicWsv}k5Iao8Os}tmo?c3;)$J_vkF0a@#HkF4UHGZeo2~>kZ+t#`gjR?|_ z)~v~o;Pv93bxLM*a}fY5i=4{d?Ng4vo4%DQ(xS!eU<1$@5~>tyW9W(DKE#v9$dT@k zW2jmi-+6eA0VT%Q2G66E`Km-2XKdpPnTx0xUKjD3OMvYw?gopUQq~hRuw+AaPl7$; zD1T~!T8G=gDLc=~#`I{vyg&KMA@T6p#4lKAjfACB^R5xY7 zb6Dxy1_w-J!h%bCT6*07m`!Pz6+KFrc>G=R%{9D4&FW-i`@NxJSh1K`2U>Ka8r@&q zc2LC0J+<$hZ#ft`_DQBYLl-Z?qyygW0W88L9@z7{B7g8{LPGnyIW6eDd) zsRZxQ%J`LguY#x#6+D8e>fEw545`)5g~ZC*A>(D6wZtTH+UYOa!C*fRDLsfk_+)&= zNyxifG4T|)Y_&jIVUca~LWZ8q@M@0Y`nD^F+rWndoA6rz?G%;e2K0c2w$IKlS89k) z9l`kSgZMZ(lo;c>5GImVb5lfZs`<#EEElg;&3F~I8in@USaH6dwP5N(|Dw*kK;&I! 
zWTsuB2oQ3>eHwWvF>_{|>vhgzn`@#yug-k~^%qgLp@C5~k2Jvr+R|*B^C{jBAXKFF zFj0|%=~sW4jkG@T`anP{C&>Xeo3qxVzEgX9Qu8q9>9AZg@s1`>1o9mpDkF27 zK?> z4rcI@f2l;mbwxdb1AanqmdeLhbEalFDOiP%S9RPNpIJ7!=h;ljOM`wi z(YW4fmo9}ISX*5HWv7FT8f!Cs5zLaJgRkSl`ThTlkv^@j7Np1gnj}*8_)bHTR3zBq zm>G?sO(zp&Jc6g^(QRRp2UmCm;L$@)dGEhaJGiv=Q#T1xdbf5l#?G{L$6w54)k2oe zBXwggH&0&ir`2Z5fgnf3NrMUs6&C(K#sn#W0^;l#`pMNSijNBOhXN?Rz>j0c_qV?PQCl3c!VJjvx$0LM zD+cvBAAFryx&>JKne6zrD)*Mo8RVvi^|CuJz*9xS!T|3TOw}8*TOmv2X+(OW*&C}aI+;cJqSwcrpm0`TcFSBrDM$nI zYhWHZ=$qG$lN8wG*a8R=Z){f~;T3h1^*T2>oIcA;z;svLT2p6nFJ938bX0;EbQ;uG z!KRZ}cC%^zbtjQx{p0zV2TM(Y;@U3;cqae*TzUQW8>;ZeJbugIQE++E{p;^i36=6& z<}48GCry#A*k|`)-56oO9Xvi;i>D@}G(n=(#`e_~Q_KH(K=Q2x&lH-ldDvkV%-9pO9-ogV~o>$*66euonMM>6+Kh&ckh8 z(BL%Wxlh7OO~LPq^PW5;TsLCrZi}jic~9=$#s$9)IEMQMrufEf^Uo9^n}Y*24;oBD zy%6f$R-Z$kV~#L2rgs#34GUcoKxbz~T3Ee%EGS456hv}9;QVyqts!@LDv~Es($HB* z)BS@+>TW^t3PG`ZauaqY1{6Iio7R=FuAG5^)Mw~ZKB6n@*N+8YMdJGRH>!`T41({= z=#_DGoGu!HyMQ?c#iCe0X283k^;%87JkjIe>jZnLhU8x(HYW%i<2n2i9U*>y1I$_3 zW%j)f>}hUh-ttkDD+GbD`s#Q(Zmf2|em0IU1gXku9Zbz9`Z9xxvQ zrEG$c3flOP*-Bzx%h`^2uQzPJ^rLZTmR&w8=mFT6-n}ZbKb@Ym+8z_bHK|d<23d}j z$#|tKWkx*JkvRxHBzb8j98R2+ff7RQH&H+eEU;zE@TI3PAoTp+%CcW3|jP9(RBhAPxGr!LcwZ{qJ$ zFs8|(#M2)qN&Mh~>_~h9d{=qEE^XKG$6Q5nFFX4We))^tH;Vv4pHMXP6MiuF^WorW zMQ)dikcPh*1!GtAGdXsC;TP|?OFsZ<>$}wXa@6|8!+QLyyPABoeGr%UNIpK{T;8vM zg@+aPdAnfK*a17;d_4EpjD#?Qnbli*+56BLPuquJ!(`A%0Dl)scO8c*Eg7ofMh$5pYs3pQQ@vbh__wAEm5?{Zgv+ahV%@touIw zAmmRkWp7oJibwDvwZpx6fk*mup-Kf8VOmZp+Guq?+|JZD+nO~&=^2`u^59R+;~uvB zK{ds%{kCULPu1Z{T`Vh>%M>6zQ zGv8Df+kHOu^sz6Zj+Er|B6bD~{yDN5^{}1xYZc4DFC7hOt=)mc2^*es?Pt%u&iZ6s zDFxmwB83~2p4ywIyiem#wY-xG;#FGa+wyHnh29NMqu^_Vq0F~IA7`_RX4~o+H}I^W zVn^-4eJzog>DDV~K54lwKK9qZ$O>QEgNKR(0!*`px%UxuADMd{cNg8oAtPm&$9Et! 
zlI+bDj%ps>b-cg5wdmI3r2Nn;y_n{pYL^yjc2UtBCrxbdXr0_DN5KgHG`8NmPoEtVm=0.23.0 +pyaml==19.4.1 +tensorboard==2.0.0 +torch==1.4.0 +torchvision==0.5.0 diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..2bf1d45 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,6 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# + +"""This package contains all tests for quant.""" diff --git a/tests/binary/__init__.py b/tests/binary/__init__.py new file mode 100644 index 0000000..0638005 --- /dev/null +++ b/tests/binary/__init__.py @@ -0,0 +1,6 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# + +"""This package contains tests for Quant binary modules.""" diff --git a/tests/binary/test_activation_quantization.py b/tests/binary/test_activation_quantization.py new file mode 100644 index 0000000..8073057 --- /dev/null +++ b/tests/binary/test_activation_quantization.py @@ -0,0 +1,258 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. 
+# + +"""Test activation quantization classes.""" + +import torch + +import quant.binary.quantization as quantization +from quant.binary.activation_quantization import ActivationQuantizerLS1,\ + ActivationQuantizerLS2, ActivationQuantizerLST, ActivationQuantizerGF + + +def test_activation_quantizer_ls1_no_ma(): + """Test no moving average mode of activation quantizer for least squares 1 bit.""" + torch.manual_seed(1234) + x = torch.ones(32, 16, 3, 3) * 2 + x2 = torch.rand(32, 16, 3, 3) # some random, but all positive tensor + + quantizer_ls1_no_ma = ActivationQuantizerLS1('off') + quantizer_ls1_no_ma.train() + quantizer_ls1_no_ma(x) # v1 should be 2 for all examples + x_q_train_no_ma = quantizer_ls1_no_ma(x) # call twice so moving avg changes if used + assert torch.all(x_q_train_no_ma == 2.0) + + quantizer_ls1_no_ma.eval() + x_q_eval_no_ma = quantizer_ls1_no_ma(x2) + # v1 should not be cached, so it should be recomputed + _, expected = quantization.quantizer_ls_1(x2) + assert torch.all(x_q_eval_no_ma.eq(expected)) + assert not torch.all(x_q_eval_no_ma.eq(x_q_train_no_ma)) + + +def test_activation_quantizer_ls1_eval_only(): + """Test eval_only mode of activation quantizer for least squares 1 bit.""" + torch.manual_seed(1234) + x = torch.ones(32, 16, 3, 3) * 2 + x2 = torch.rand(32, 16, 3, 3) # some random, but all positive tensor + x3 = torch.ones(32, 16, 3, 3) * 4 + + quantizer_ls1_eval_only = ActivationQuantizerLS1('eval_only', 0.9) + quantizer_ls1_eval_only.train() + x_q_train_eval_only = quantizer_ls1_eval_only(x) + assert torch.all(x_q_train_eval_only == 2.0) + x_q_train_eval_only = quantizer_ls1_eval_only(x3) + assert torch.all(x_q_train_eval_only == 4.0) + + quantizer_ls1_eval_only.eval() + x_q_eval_eval_only = quantizer_ls1_eval_only(x2) + # moving average should cause v1 to become 2 * 0.9 + 4 * 0.1 = 2.2 + assert torch.all(x_q_eval_eval_only == 2.2) + + +def test_activation_quantizer_ls1_train_and_eval(): + """Test train_and_eval mode of activation quantizer 
for least squares 1 bit.""" + torch.manual_seed(1234) + x = torch.ones(32, 16, 3, 3) * 2 + x2 = torch.rand(32, 16, 3, 3) # some random, but all positive tensor + x3 = torch.ones(32, 16, 3, 3) * 4 + + quantizer_ls1_all_ma = ActivationQuantizerLS1('train_and_eval', 0.9) + + quantizer_ls1_all_ma.train() + x_q_train_all_ma = quantizer_ls1_all_ma(x) + assert torch.all(x_q_train_all_ma == 2.0) + x_q_train_all_ma = quantizer_ls1_all_ma(x3) + # 2 * 0.9 + 4 * 0.1 = 2.2 + assert torch.all(x_q_train_all_ma == 2.2) + + quantizer_ls1_all_ma.eval() + x_q_eval_all_ma = quantizer_ls1_all_ma(x2) + assert torch.all(x_q_eval_all_ma == 2.2) + + +def test_activation_quantizer_ls2_no_ma(): + """Test no moving average mode of activation quantizer for least squares 2 bits.""" + torch.manual_seed(1234) + x = torch.ones(32, 16, 3, 3) * 2 + x2 = torch.rand(32, 16, 3, 3) # some random, but all positive tensor + + quantizer_ls2_no_ma = ActivationQuantizerLS2('off') + quantizer_ls2_no_ma.train() + quantizer_ls2_no_ma(x) # v1 should be 2 for all examples + x_q_train_no_ma = quantizer_ls2_no_ma(x) # call twice so moving avg changes if used + assert torch.all(x_q_train_no_ma == 2.0) + + quantizer_ls2_no_ma.eval() + x_q_eval_no_ma = quantizer_ls2_no_ma(x2) + # v1, v2 should not be cached, so it should be recomputed + _, _, expected = quantization.quantizer_ls_2(x2) + assert torch.all(x_q_eval_no_ma.eq(expected)) + assert not torch.all(x_q_eval_no_ma.eq(x_q_train_no_ma)) + + +def test_activation_quantizer_ls2_eval_only(): + """Test eval_only mode of activation quantizer for least squares 2 bits.""" + torch.manual_seed(1234) + x = torch.ones(32, 16, 3, 3) * 2 + x2 = torch.rand(32, 16, 3, 3) # some random, but all positive tensor + x3 = torch.ones(32, 16, 3, 3) * 4 + + quantizer_ls2_eval_only = ActivationQuantizerLS2('eval_only', 0.9) + quantizer_ls2_eval_only.train() + x_q_train_eval_only = quantizer_ls2_eval_only(x) + assert torch.all(x_q_train_eval_only == 2.0) + x_q_train_eval_only = 
quantizer_ls2_eval_only(x3) + assert torch.all(x_q_train_eval_only == 4.0) + + quantizer_ls2_eval_only.eval() + x_q_eval_eval_only = quantizer_ls2_eval_only(x2) + # moving average should cause v1 to become 2 * 0.9 + 4 * 0.1 = 2.2, v2 should be 0 + torch.all(x_q_eval_eval_only == 2.2) + + +def test_activation_quantizer_ls2_train_and_eval(): + """Test train_and_eval mode of activation quantizer for least squares 2 bits.""" + torch.manual_seed(1234) + x = torch.ones(32, 16, 3, 3) * 2 + x2 = torch.rand(32, 16, 3, 3) # some random, but all positive tensor + x3 = torch.ones(32, 16, 3, 3) * 4 + + quantizer_ls2_all_ma = ActivationQuantizerLS2('train_and_eval', 0.9) + + quantizer_ls2_all_ma.train() + x_q_train_all_ma = quantizer_ls2_all_ma(x) + assert torch.all(x_q_train_all_ma == 2.0) + x_q_train_all_ma = quantizer_ls2_all_ma(x3) + # v1 = 2 * 0.9 + 4 * 0.1 = 2.2, v2 should be 0 + assert torch.all(x_q_train_all_ma == 2.2) + + quantizer_ls2_all_ma.eval() + x_q_eval_all_ma = quantizer_ls2_all_ma(x2) + assert torch.all(x_q_eval_all_ma == 2.2) + + +def test_activation_quantizer_lsT_no_ma(): + """Test no moving average mode of activation quantizer for least squares ternary.""" + torch.manual_seed(1234) + x = torch.ones(32, 16, 3, 3) * 2 + x2 = torch.rand(32, 16, 3, 3) # some random, but all positive tensor + + quantizer_lsT_no_ma = ActivationQuantizerLST('off') + quantizer_lsT_no_ma.train() + quantizer_lsT_no_ma(x) + x_q_train_no_ma = quantizer_lsT_no_ma(x) # call twice so moving avg changes if used + assert torch.all(x_q_train_no_ma == 2.0) + + quantizer_lsT_no_ma.eval() + x_q_eval_no_ma = quantizer_lsT_no_ma(x2) + # v1 should not be cached, so it should be recomputed + _, expected = quantization.quantizer_ls_ternary(x2) + assert torch.all(x_q_eval_no_ma.eq(expected)) + + +def test_activation_quantizer_lsT_eval_only(): + """Test eval_only mode of activation quantizer for least squares ternary.""" + torch.manual_seed(1234) + x = torch.ones(32, 16, 3, 3) * 2 + x2 = torch.rand(32, 
16, 3, 3) # some random, but all positive tensor + x3 = torch.ones(32, 16, 3, 3) * 4 + + quantizer_lsT_eval_only = ActivationQuantizerLST('eval_only', 0.9) + quantizer_lsT_eval_only.train() + # moving average should cause tracked v1 to become 1.0 after call + x_q_train_eval_only = quantizer_lsT_eval_only(x) + assert torch.all(x_q_train_eval_only == 2.0) + # moving average should cause tracked v1 to become 1 * 0.9 + 2 * 0.1 = 1.1 after call + x_q_train_eval_only = quantizer_lsT_eval_only(x3) + assert torch.all(x_q_train_eval_only == 4.0) + + quantizer_lsT_eval_only.eval() + x_q_eval_eval_only = quantizer_lsT_eval_only(x2) + _, expected = quantization.quantizer_ls_ternary(x2, torch.tensor([1.1] * 32)) + assert torch.all(x_q_eval_eval_only.eq(expected)) + + +def test_activation_quantizer_lsT_train_and_eval(): + """Test train_and_eval mode of activation quantizer for least squares ternary.""" + torch.manual_seed(1234) + x = torch.ones(32, 16, 3, 3) * 2 + x2 = torch.rand(32, 16, 3, 3) # some random, but all positive tensor + x3 = torch.ones(32, 16, 3, 3) * 4 + + quantizer_lsT_all_ma = ActivationQuantizerLST('train_and_eval', 0.9) + + quantizer_lsT_all_ma.train() + # moving average should cause tracked v1 to become 1.0 after call + x_q_train_all_ma = quantizer_lsT_all_ma(x) + _, expected = quantization.quantizer_ls_ternary(x, torch.tensor([1.0] * 32)) + assert torch.all(x_q_train_all_ma.eq(expected)) + # moving average should cause tracked v1 to become 1 * 0.9 + 2 * 0.1 = 1.1 after call + x_q_train_all_ma = quantizer_lsT_all_ma(x3) + _, expected = quantization.quantizer_ls_ternary(x, torch.tensor([1.1] * 32)) + assert torch.all(x_q_train_all_ma.eq(expected)) + + quantizer_lsT_all_ma.eval() + x_q_eval_train_and_eval = quantizer_lsT_all_ma(x2) + _, expected = quantization.quantizer_ls_ternary(x2, torch.tensor([1.1] * 32)) + assert torch.all(x_q_eval_train_and_eval.eq(expected)) + + +def test_activation_quantizer_gf_no_ma(): + """Test no moving average mode of activation 
quantizer for greedy foldable.""" + torch.manual_seed(1234) + x = torch.ones(32, 16, 3, 3) * 2 + x2 = torch.ones(32, 16, 3, 3) * 4 + + quantizer_gf_no_ma = ActivationQuantizerGF(2, 'off') + quantizer_gf_no_ma.train() + quantizer_gf_no_ma(x) + x_q_train_no_ma = quantizer_gf_no_ma(x) # call twice so moving avg changes if used + assert torch.all(x_q_train_no_ma == 2.0) + + quantizer_gf_no_ma.eval() + x_q_eval_no_ma = quantizer_gf_no_ma(x2) + assert torch.all(x_q_eval_no_ma == 4.0) + + +def test_activation_quantizer_gf_eval_only(): + """Test eval_only mode of activation quantizer for greedy foldable.""" + torch.manual_seed(1234) + x = torch.ones(32, 16, 3, 3) * 2 + x2 = torch.rand(32, 16, 3, 3) # some random, but all positive tensor + x3 = torch.ones(32, 16, 3, 3) * 4 + + quantizer_gf_eval_only = ActivationQuantizerGF(2, 'eval_only', 0.9) + quantizer_gf_eval_only.train() + x_q_train_eval_only = quantizer_gf_eval_only(x) + assert torch.all(x_q_train_eval_only == 2.0) + x_q_train_eval_only = quantizer_gf_eval_only(x3) + assert torch.all(x_q_train_eval_only == 4.0) + + quantizer_gf_eval_only.eval() + x_q_eval_eval_only = quantizer_gf_eval_only(x2) + # moving average should cause v1 to become 2 * 0.9 + 4 * 0.1 = 2.2, v2 should be 0 + torch.all(x_q_eval_eval_only == 2.2) + + +def test_activation_quantizer_gf_train_and_eval(): + """Test train_and_eval mode of activation quantizer for least squares greedy foldable.""" + torch.manual_seed(1234) + x = torch.ones(32, 16, 3, 3) * 2 + x2 = torch.rand(32, 16, 3, 3) # some random, but all positive tensor + x3 = torch.ones(32, 16, 3, 3) * 4 + + quantizer_gf_all_ma = ActivationQuantizerGF(2, 'train_and_eval', 0.9) + + quantizer_gf_all_ma.train() + x_q_train_all_ma = quantizer_gf_all_ma(x) + assert torch.all(x_q_train_all_ma == 2.0) + x_q_train_all_ma = quantizer_gf_all_ma(x3) + # v1 = 2 * 0.9 + 4 * 0.1 = 2.2, v2 should be 0 + assert torch.all(x_q_train_all_ma == 2.2) + + quantizer_gf_all_ma.eval() + x_q_eval_all_ma = 
quantizer_gf_all_ma(x2) + assert torch.all(x_q_eval_all_ma == 2.2) diff --git a/tests/binary/test_binary_conv.py b/tests/binary/test_binary_conv.py new file mode 100644 index 0000000..6fe35b9 --- /dev/null +++ b/tests/binary/test_binary_conv.py @@ -0,0 +1,107 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# + +"""Test binary convolution.""" + +import itertools + +import pytest +import torch +import torch.nn as nn + + +from quant.binary.binary_conv import QuantConv2d + + +def test_fp_quant_conv2d_eq_nn_conv2d(): + """Test full precision QuantConv2d equals to regular Conv2d.""" + torch.manual_seed(1234) + x = torch.randn(8, 3, 100, 100, requires_grad=True) + x_copy = x.clone().detach().requires_grad_(True) + + nn_conv2d = nn.Conv2d(3, 30, 5) + expected_out = nn_conv2d(x) + expected_loss = expected_out.sum() + + scaled_conv2d = QuantConv2d('fp', 'fp', 3, 30, 5) + scaled_conv2d.weight = nn.Parameter(nn_conv2d.weight, requires_grad=True) + scaled_conv2d.bias = nn.Parameter(nn_conv2d.bias, requires_grad=True) + actual_out = scaled_conv2d(x_copy) + actual_loss = actual_out.sum() + + expected_loss.backward() + actual_loss.backward() + + assert torch.all(expected_out.eq(actual_out)) + assert torch.all(x.grad.eq(x_copy.grad)) + + +def test_ls1_quant_conv2d_sanity(): + """Sanity check for least squares 1-bit scaled binary Conv2d.""" + torch.manual_seed(1234) + x = torch.randn(4, 3, 8, 8) + conv2d = QuantConv2d('ls-1', 'ls-1', 3, 16, (2, 2)) + y = conv2d(x) + # the absolute value of each element in the input x and weight should be at most 1 + # this is a quick sanity check for each of the 16 filters + for i in range(4): + for j in range(16): + assert torch.max(y[i, j].abs()) <= 2 * 2 * 3 + conv2d.bias[j] + + +def test_w_ls1_x_fp_quant_conv2d(): + """Basic test for ls-1 weight, fp activation (input).""" + x = torch.zeros(1, 3, 8, 8) + x.data[0, :, :4, 4:] = -1 + x.data[0, :, 4:, :4] = 2 + x.data[0, :, 4:, 4:] = -3 + 
conv2d = QuantConv2d( + 'fp', 'ls-1', 3, 1, (4, 4), stride=4, bias=False + ) + y = conv2d(x).squeeze() + assert y.shape == (2, 2) + assert y[0, 0] == 0 + assert torch.isclose(y[1, 0], -2 * y[0, 1]) + assert torch.isclose(y[1, 1], 3 * y[0, 1]) + + +def test_quant_conv2d_parameter_group_keys(): + """Test parameter groups are separated correctly.""" + clamp = {'alpha': 2, 'kind': 'symmetric'} + conv2d = QuantConv2d( + 'ls-2', 'ls-1', 3, 1, (4, 4), clamp=clamp, stride=4, bias=False + ) + assert len(conv2d.quantized_parameters['fp']) == 0 + assert len(conv2d.quantized_parameters['ls-1']) == 1 + assert set(conv2d.quantized_parameters.keys()) - {'fp', 'ls-1'} == set() + assert len(list(conv2d.parameters())) == 1 + + conv2d = QuantConv2d('ls-2', 'ls-2', 3, 1, (4, 4), clamp=clamp, stride=4) + assert len(conv2d.quantized_parameters['fp']) == 1 + assert len(conv2d.quantized_parameters['ls-2']) == 1 + assert set(conv2d.quantized_parameters.keys()) - {'fp', 'ls-2'} == set() + assert len(list(conv2d.parameters())) == 2 + + +def test_quant_conv2d_combinations(): + """Test different combinations of configurations to see they can be created.""" + schemes = ['fp', 'ls-1', 'ls-2', 'ls-T', 'gf-2', 'gf-3'] + for x_scheme, w_scheme in itertools.product(schemes, schemes): + QuantConv2d(x_scheme, w_scheme, 3, 1, (4, 4)) + + with pytest.raises(ValueError): + QuantConv2d('ls', 'ls-1', 3, 1, (4, 4)) + + with pytest.raises(ValueError): + QuantConv2d('l2', 'ls-1', 3, 1, (4, 4)) + + with pytest.raises(ValueError): + QuantConv2d('ls-1', 'ls-3', 3, 1, (4, 4)) + + with pytest.raises(ValueError): + QuantConv2d('ls-1', 'l2', 3, 1, (4, 4)) + + with pytest.raises(ValueError): + QuantConv2d('ls-1', 'ls-2', 3, 1, (4, 4), clamp={'kind': 'sym'}) diff --git a/tests/binary/test_quantization.py b/tests/binary/test_quantization.py new file mode 100644 index 0000000..023d253 --- /dev/null +++ b/tests/binary/test_quantization.py @@ -0,0 +1,165 @@ +# +# For licensing see accompanying LICENSE file. 
+# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# + +"""Test quantization functions.""" + +import torch + +import quant.binary.quantization as quantization +from quant.binary.ste import binarize, binary_sign + + +def test_clamp_identity(): + """Test identity clamp function.""" + x = torch.tensor([-1.0, 0.0, 1.0, 2.0]) + assert torch.all(x.eq(quantization.clamp_identity(x))) + + +def test_clamp_symmetric(): + """Test symmetric clamp function.""" + x = torch.tensor([-1.0, 0.0, 1.0, 2.0]) + assert torch.all(torch.tensor([-1, 0, 1, 1]).eq(quantization.clamp_symmetric(x, 1))) + assert torch.all(torch.tensor([-0.5, 0, 0.5, 0.5]).eq(quantization.clamp_symmetric(x, 0.5))) + assert torch.all(torch.tensor([-1, 0, 1, 2]).eq(quantization.clamp_symmetric(x, 2))) + assert torch.all(torch.tensor([-1, 0, 1, 2]).eq(quantization.clamp_symmetric(x, 3))) + + +def test_quantizer_fp(): + """Test full precision (identity) quantizer.""" + quantizer_fp = quantization.QuantizerFP() + x = torch.tensor([-1, 0, 1, 2]) + assert torch.all(x.eq(quantizer_fp(x))) + + +def test_quantizer_ls_1_optimal(): + """Test 1-bit optimal least-squares scaled binary quantization.""" + torch.manual_seed(1234) + x = torch.randn(1000, 3, 64, 64) + + _, x_q = quantization.quantizer_ls_1(x) + assert x_q.shape == x.shape + + # Check x_q has lower least-squares error compared with using random scaling factors + subopt_scaling_factor = torch.randn(1000, 1, 1, 1).abs() + subopt_quantization = subopt_scaling_factor * binarize(x) + opt_costs = torch.norm((x_q - x).view(1000, -1), dim=1) + subopt_costs = torch.norm((subopt_quantization - x).view(1000, -1), dim=1) + assert torch.all(opt_costs <= subopt_costs) + + +def test_quantizer_ls_2_optimal(): + """Test 2-bit optimal least squares scaled binary quantization.""" + torch.manual_seed(1234) + x = torch.randn(1000, 3, 64, 64) + + _, _, x_q = quantization.quantizer_ls_2(x, skip=1) + assert x_q.shape == x.shape + + # Check x_q has lower least-squares error compared 
with using random scaling factors + rand_indices = torch.randint(0, 3 * 64 * 64, (1000,)) + subopt_v1 = x.view(1000, -1)[torch.arange(1000), rand_indices].view(1000, 1).abs() + s2 = x.view(1000, -1) - subopt_v1 * binary_sign(x.view(1000, -1)) + subopt_v2 = s2.abs().mean(dim=-1, keepdim=True) + + b1 = binarize(x) + subopt_v1 = subopt_v1.view(1000, 1, 1, 1) + subopt_v2 = subopt_v2.view(1000, 1, 1, 1) + subopt_quantization = subopt_v1 * b1 + subopt_v2 * binarize(x - subopt_v1 * b1) + + opt_costs = torch.norm((x_q - x).view(1000, -1), dim=1) + subopt_costs = torch.norm((subopt_quantization - x).view(1000, -1), dim=1) + assert torch.all(opt_costs <= subopt_costs) + + +def test_quantizer_ls_T_optimal(): + """Test ternary optimal least squares scaled binary quantization.""" + torch.manual_seed(1234) + x = torch.randn(1000, 3, 64, 64) + + _, x_q = quantization.quantizer_ls_ternary(x, skip=1) + assert x_q.shape == x.shape + + # Check x_q has lower least-squares error compared with using random scaling factors + rand_indices = torch.randint(0, 3 * 64 * 64, (1000,)) + subopt_v1 = x.view(1000, -1)[torch.arange(1000), rand_indices].view(1000, 1, 1, 1).abs() + b1 = binarize(x) + subopt_quantization = subopt_v1 * b1 + subopt_v1 * binarize(x - subopt_v1 * b1) + + opt_costs = torch.norm((x_q - x).view(1000, -1), dim=1) + subopt_costs = torch.norm((subopt_quantization - x).view(1000, -1), dim=1) + assert torch.all(opt_costs <= subopt_costs) + + +def test_quantizer_ls_T_all_inputs_equal(): + """Test ternary optimal least squares scaled binary quantization edge case.""" + torch.manual_seed(1234) + x = torch.ones(32, 3, 16, 16) * 2 + _, x_q = quantization.quantizer_ls_ternary(x) + + assert torch.all(x_q == 2.0) + + # Test the case just certain rows have all elements equal + x = torch.rand(32, 3, 16, 16) + x[1, :, :, :] = torch.ones(3, 16, 16) * 2 + x[9, :, :, :] = torch.ones(3, 16, 16) * -3 + + _, x_q = quantization.quantizer_ls_ternary(x) + + assert torch.all(x_q[1, :, :, :] == 2) + 
assert torch.all(x_q[9, :, :, :] == -3) + + +def test_quantizer_gf_more_bits_are_better(): + """Test the more bits are used for gf, the better it is.""" + torch.manual_seed(1234) + x = torch.randn(1000, 3, 64, 64) + + _, x_q_gf1 = quantization.quantizer_gf(x, k=1) + _, x_q_gf2 = quantization.quantizer_gf(x, k=2) + _, x_q_gf3 = quantization.quantizer_gf(x, k=3) + _, x_q_gf4 = quantization.quantizer_gf(x, k=4) + + gf1_costs = torch.norm((x_q_gf1 - x).view(1000, -1), dim=1) + gf2_costs = torch.norm((x_q_gf2 - x).view(1000, -1), dim=1) + gf3_costs = torch.norm((x_q_gf3 - x).view(1000, -1), dim=1) + gf4_costs = torch.norm((x_q_gf4 - x).view(1000, -1), dim=1) + + assert torch.all(gf2_costs <= gf1_costs) + assert torch.all(gf3_costs <= gf2_costs) + assert torch.all(gf4_costs <= gf3_costs) + + +def test_quantizer_ls2_better_than_lsT(): + """Test ls-2 is better than ls-T, which is better than ls-1.""" + torch.manual_seed(1234) + x = torch.randn(1000, 3, 64, 64) + + _, _, x_q_ls2 = quantization.quantizer_ls_2(x, skip=1) + _, x_q_lsT = quantization.quantizer_ls_ternary(x, skip=1) + _, x_q_ls1 = quantization.quantizer_ls_1(x) + + ls2_costs = torch.norm((x_q_ls2 - x).view(1000, -1), dim=1) + lsT_costs = torch.norm((x_q_lsT - x).view(1000, -1), dim=1) + ls1_costs = torch.norm((x_q_ls1 - x).view(1000, -1), dim=1) + + assert torch.all(ls2_costs <= lsT_costs) + assert torch.all(lsT_costs <= ls1_costs) + + +def test_quantizer_ls2_better_than_gf2(): + """Test ls-2 is better than gf-2, which is better than ls-1.""" + torch.manual_seed(1234) + x = torch.randn(1000, 3, 64, 64) + + _, _, x_q_ls2 = quantization.quantizer_ls_2(x, skip=1) + _, x_q_gf2 = quantization.quantizer_gf(x, k=2) + _, x_q_ls1 = quantization.quantizer_ls_1(x) + + ls2_costs = torch.norm((x_q_ls2 - x).view(1000, -1), dim=1) + gf2_costs = torch.norm((x_q_gf2 - x).view(1000, -1), dim=1) + ls1_costs = torch.norm((x_q_ls1 - x).view(1000, -1), dim=1) + + assert torch.all(ls2_costs <= gf2_costs) + assert torch.all(gf2_costs 
<= ls1_costs) diff --git a/tests/binary/test_ste.py b/tests/binary/test_ste.py new file mode 100644 index 0000000..3821c95 --- /dev/null +++ b/tests/binary/test_ste.py @@ -0,0 +1,36 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# + +"""Test straight-through estimator.""" + +import torch + +from quant.binary.ste import binarize + + +def test_ste_sign_forward(): + """Test the forward pass of STESign.""" + x = torch.tensor([42, -42, 42, 42, 0, -1, 1, -4.2, 4.2]) + xb = binarize(x) + xb_expected = torch.tensor([1, -1, 1, 1, 1, -1, 1, -1, 1]) + assert torch.all(xb.eq(xb_expected)) + + +def test_ste_sign_backward_multiloss(): + """ + Test STESign backward computes gradient correctly. + + x = [x1, x2, ..., xn] + l = sum(sign(x)) + dl/dxi = 1 iff |xi| <= 1 + """ + x = torch.tensor([42, -42, 0, -1, 1, -0.2, 0.2], requires_grad=True) + + xb = binarize(x) + loss = xb.sum() + loss.backward() + + grad_expected = torch.tensor([0, 0, 1, 1, 1, 1, 1]) + assert torch.all(x.grad.eq(grad_expected)) diff --git a/tests/binary/test_weight_quantization.py b/tests/binary/test_weight_quantization.py new file mode 100644 index 0000000..7d2e330 --- /dev/null +++ b/tests/binary/test_weight_quantization.py @@ -0,0 +1,81 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. 
+# + +"""Test weight quantization classes.""" + +import torch + +import quant.binary.quantization as quantization +import quant.binary.weight_quantization as weight_quantization + + +def test_weight_quantizer_ls1_modes(): + """Test training mode and eval mode for WeightQuantizerLS1.""" + torch.manual_seed(1234) + quantizer_ls1 = weight_quantization.WeightQuantizerLS1(32) + w = torch.ones(32, 16, 3, 3) * 2 + + quantizer_ls1.train() + w_q_train = quantizer_ls1(w) # v1 should be 2 for all channels + assert torch.all(w_q_train == 2.0) + + quantizer_ls1.eval() + w = torch.rand(32, 16, 3, 3) # some random, but all positive tensor + w_q_eval = quantizer_ls1(w) + + # since every element of matrix is quantized to +1, and scaling factor is 2 + assert torch.all(w_q_train.eq(w_q_eval)) + + +def test_weight_quantizer_ls2_modes(): + """Test training mode and eval mode for WeightQuantizerLS2.""" + torch.manual_seed(1234) + quantizer_ls2 = weight_quantization.WeightQuantizerLS2(32) + w = torch.ones(32, 16, 3, 3) * 2 + + quantizer_ls2.train() + w_q_train = quantizer_ls2(w) + assert torch.all(w_q_train == 2.0) + + quantizer_ls2.eval() + w = torch.rand(32, 16, 3, 3) # some random, but all positive tensor + w_q_eval = quantizer_ls2(w) + + assert torch.all(w_q_train.eq(w_q_eval)) + + +def test_weight_quantizer_lsT_modes(): + """Test training mode and eval mode for WeightQuantizerLST.""" + torch.manual_seed(1234) + quantizer_lsT = weight_quantization.WeightQuantizerLST(32) + w = torch.rand(32, 16, 3, 3) + + quantizer_lsT.train() + _ = quantizer_lsT(w) + v1 = quantizer_lsT.v1 + + quantizer_lsT.eval() + w = torch.rand(32, 16, 3, 3) # some random, but all positive tensor + w_q_eval = quantizer_lsT(w) + _, w_q_eval_expected = quantization.quantizer_ls_ternary(w, v1=v1) + + assert torch.all(w_q_eval.eq(w_q_eval_expected)) + + +def test_weight_quantizer_gf_modes(): + """Test training mode and eval mode for WeightQuantizerGF.""" + torch.manual_seed(1234) + quantizer_gf = 
weight_quantization.WeightQuantizerGF(32, 2) + w = torch.ones(32, 16, 3, 3) * 2 + + quantizer_gf.train() + w_q_train = quantizer_gf(w) + assert torch.all(w_q_train == 2.0) + + quantizer_gf.eval() + w = torch.rand(32, 16, 3, 3) # some random, but all positive tensor + w_q_eval = quantizer_gf(w) + + assert torch.all(w_q_train.eq(w_q_eval)) diff --git a/tests/common/__init__.py b/tests/common/__init__.py new file mode 100644 index 0000000..10c2c1e --- /dev/null +++ b/tests/common/__init__.py @@ -0,0 +1,6 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# + +"""This package contains tests for Quant common modules.""" diff --git a/tests/common/test_experiment.py b/tests/common/test_experiment.py new file mode 100644 index 0000000..9032f38 --- /dev/null +++ b/tests/common/test_experiment.py @@ -0,0 +1,33 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# + +"""Test running experiment on platform.""" + +import pytest + +from quant.common.compute_platform import LocalComputePlatform +from quant.common.experiment import Experiment +from quant.common.tasks import classification_task +from quant.utils.visualization import get_tensorboard_hooks + +from tests.data.helpers import get_base_config_template, RandomQuantDataLoader + + +@pytest.mark.slow +def test_run_experiment_on_platform(tmp_path): + config = get_base_config_template( + tmp_path, 'dummy_experiment', + {'x_quant': 'ls-2', 'w_quant': 'ls-1'} + ) + + platform = LocalComputePlatform(str(tmp_path)) + + experiment = Experiment( + classification_task, config, RandomQuantDataLoader, get_tensorboard_hooks + ) + platform.run(experiment) + + assert (tmp_path / experiment.name / 'config.yaml').exists() + assert (tmp_path / experiment.name / 'metrics' / 'test.csv').exists() diff --git a/tests/common/test_initialization.py b/tests/common/test_initialization.py new file mode 100644 index 0000000..adcb0f4 --- 
/dev/null +++ b/tests/common/test_initialization.py @@ -0,0 +1,165 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# + +"""Test initialization.""" + +from unittest.mock import patch + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.optim import Adam, SGD +import torch.optim.lr_scheduler as lr_scheduler + +from quant.common.initialization import _get_best_gpus, get_model, get_optimizer, get_lr_scheduler +from quant.models.lenet import QLeNet5 +from quant.utils.linear_lr_scheduler import LinearLR + + +def test_get_model_cpu(): + """Test get model factory on CPU.""" + arch = {'conv1_filters': 20, 'conv2_filters': 50, 'output_classes': 10} + model = get_model('lenet5', F.nll_loss, arch, torch.device('cpu'), 0) + + assert isinstance(model, QLeNet5) + assert model.loss_fn == F.nll_loss + assert next(model.parameters()).device.type == 'cpu' + + +def test_get_model_single_gpu(): + """Test get model factory on single GPU.""" + if not torch.cuda.is_available(): + return + + arch = {'conv1_filters': 20, 'conv2_filters': 50, 'output_classes': 10} + model = get_model('lenet5', F.nll_loss, arch, torch.device('cuda:0'), 1) + + assert isinstance(model, QLeNet5) + assert model.loss_fn == F.nll_loss + assert next(model.parameters()).device.type == 'cuda' + + +def test_get_model_multi_gpu(): + """Test get model factory on multiple GPUs.""" + if torch.cuda.device_count() <= 1: + return + + arch = {'conv1_filters': 20, 'conv2_filters': 50, 'output_classes': 10} + model = get_model('lenet5', F.nll_loss, arch, torch.device('cuda:0'), 2) + + assert isinstance(model, nn.DataParallel) + assert model.module.loss_fn == F.nll_loss + + +@patch('torch.cuda.device_count') +@patch('torch.cuda.get_device_capability') +def test_get_best_gpus(capability_mock, device_count_mock): + """Test _get_best_gpus returns the best GPUs.""" + def device_capability_side_effect(device_id): + if device_id == 0: + return 6, 0
+ if device_id == 1: + return 7, 5 + if device_id == 2: + return 6, 5 + + assert torch.cuda.device_count is device_count_mock + assert torch.cuda.get_device_capability is capability_mock + device_count_mock.return_value = 3 + capability_mock.side_effect = device_capability_side_effect + + device_ids = _get_best_gpus(2) + + assert set(device_ids) == {1, 2} + + +def test_get_optimizer(): + """Test get optimizer factory.""" + model = QLeNet5(F.nll_loss) + + optimizer = get_optimizer(model.parameters(), {'algorithm': 'sgd', 'lr': 0.1}) + assert isinstance(optimizer, SGD) + + optimizer = get_optimizer(model.parameters(), {'algorithm': 'adam', 'lr': 0.1}) + assert isinstance(optimizer, Adam) + + +def test_get_linear_lr_scheduler(): + """Test get linear lr scheduler.""" + model = QLeNet5(F.nll_loss) + optimizer = get_optimizer(model.parameters(), {'algorithm': 'sgd', 'lr': 0.1}) + + scheduler = get_lr_scheduler( + optimizer, + {'scheduler': 'linear_lr', 'min_lr': 1e-5}, 80, 100 + ) + + assert isinstance(scheduler, LinearLR) + # This test just check we can construct a LinearLR, + # test_linear_lr_scheduler actually tests its behavior + + +def test_get_step_lr_scheduler(): + """Test get step lr scheduler.""" + model = QLeNet5(F.nll_loss) + optimizer = get_optimizer(model.parameters(), {'algorithm': 'sgd', 'lr': 0.1}) + + scheduler = get_lr_scheduler( + optimizer, + {'scheduler': 'step_lr', 'step_size': 1, 'gamma': 0.7}, 5, 100 + ) + + assert isinstance(scheduler, lr_scheduler.StepLR) + for _ in range(100): + assert optimizer.param_groups[0]['lr'] == 0.1 + optimizer.step() + scheduler.step() + + assert optimizer.param_groups[0]['lr'] == 0.7 * 0.1 + + +def test_get_multi_step_lr_scheduler(): + """Test get multi step lr scheduler.""" + model = QLeNet5(F.nll_loss) + optimizer = get_optimizer(model.parameters(), {'algorithm': 'sgd', 'lr': 0.1}) + scheduler = get_lr_scheduler( + optimizer, + {'scheduler': 'multi_step_lr', 'milestones': [30, 70], 'gamma': 0.7}, 70, 100 + ) + + 
assert isinstance(scheduler, lr_scheduler.MultiStepLR) + for _ in range(30 * 100): + assert optimizer.param_groups[0]['lr'] == 0.1 + optimizer.step() + scheduler.step() + + for _ in range(40 * 100): + assert optimizer.param_groups[0]['lr'] == 0.7 * 0.1 + optimizer.step() + scheduler.step() + + assert optimizer.param_groups[0]['lr'] == 0.7 * 0.7 * 0.1 + + +def test_get_lambda_lr_scheduler(): + """Test get lambda lr scheduler.""" + model = QLeNet5(F.nll_loss) + optimizer = get_optimizer(model.parameters(), {'algorithm': 'sgd', 'lr': 0.1}) + + lr_lambda = """lambda s: next( + v for (a, b), v in {(0, 200): 1, (200, 1000): 0.75}.items() if a <= s < b + )""" + scheduler = get_lr_scheduler( + optimizer, + {'scheduler': 'lambda_lr', 'lr_lambda': lr_lambda}, 10, 100 + ) + + assert isinstance(scheduler, lr_scheduler.LambdaLR) + for _ in range(200): + assert optimizer.param_groups[0]['lr'] == 0.1 + optimizer.step() + scheduler.step() + + assert optimizer.param_groups[0]['lr'] == 0.75 * 0.1 diff --git a/tests/common/test_metrics.py b/tests/common/test_metrics.py new file mode 100644 index 0000000..926d5b7 --- /dev/null +++ b/tests/common/test_metrics.py @@ -0,0 +1,155 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. 
+# + +"""Test metrics.""" + +import pytest +import torch +import torch.nn as nn +import torch.nn.functional as F + +from quant.common.metrics import LossMetric, Top1Accuracy, TopKAccuracy + + +def test_loss_metric_no_accumulate(): + """Test loss metric returns correct value with no accumulate.""" + criterion = F.nll_loss + metric = LossMetric(criterion, accumulate=False) + model = nn.LogSoftmax(dim=1) + X = torch.randn(3, 5) + output = model(X) + target = torch.tensor([1, 0, 4]) + metric.update(output, target) + + assert F.nll_loss(output, target).item() == metric.compute() + + # Check this is true after re-computation + assert F.nll_loss(output, target).item() == metric.compute() + + # Check update + Y = torch.randn(3, 5) + output2 = model(Y) + metric.update(output2, target) + assert F.nll_loss(output2, target).item() == metric.compute() + + # Check this is true after reset & re-computation + metric.reset() + metric.update(output, target) + assert F.nll_loss(output, target).item() == metric.compute() + + +def test_loss_metric_accumulate(): + """Test loss metric returns correct value with accumulate.""" + criterion = F.nll_loss + metric = LossMetric(criterion, accumulate=True) + model = nn.LogSoftmax(dim=1) + X = torch.randn(3, 5) + output = model(X) + target = torch.tensor([1, 0, 4]) + metric.update(output, target) + + assert F.nll_loss(output, target).item() == pytest.approx(metric.compute()) + + # Check this is true after re-computation + assert F.nll_loss(output, target).item() == pytest.approx(metric.compute()) + + # Check update + Y = torch.randn(3, 5) + output2 = model(Y) + metric.update(output2, target) + assert F.nll_loss(torch.cat([output, output2]), torch.cat([target, target])).item() \ + == pytest.approx(metric.compute()) + + # Check this is true after reset & re-computation + metric.reset() + metric.update(output, target) + assert F.nll_loss(output, target).item() == pytest.approx(metric.compute()) + + +def test_top_1_accuracy_metric_no_accumulate(): + 
"""Test top-1 accuracy metric returns correct value with no accumulate.""" + metric = Top1Accuracy(accumulate=False) + + metric.update(torch.tensor([[0.1, 0.2, 0.3]]), torch.tensor([2])) + assert metric.compute() == 1.0 + + # Check this is true after re-computation + assert metric.compute() == 1.0 + + metric.update(torch.tensor([[0.1, 0.2, 0.3]]), torch.tensor([1])) + assert metric.compute() == 0 + + # Check after reset & re-computation + metric.reset() + metric.update(torch.tensor([[0.1, 0.2, 0.3]]), torch.tensor([2])) + assert metric.compute() == 1.0 + + +def test_top_1_accuracy_metric_accumulate(): + """Test top-1 accuracy metric returns correct value with accumulate.""" + metric = Top1Accuracy(accumulate=True) + + metric.update(torch.tensor([[0.1, 0.2, 0.3]]), torch.tensor([2])) + assert metric.compute() == 1.0 + + metric.update(torch.tensor([[0.1, 0.2, 0.3]]), torch.tensor([1])) + assert metric.compute() == 0.5 + + metric.update(torch.tensor([[0.1, 0.2, 0.3]]), torch.tensor([0])) + assert metric.compute() == 1 / 3 + + # Check this is true after re-computation + assert metric.compute() == 1 / 3 + + # Check this is true after reset & re-computation + metric.reset() + metric.update(torch.tensor([[0.1, 0.2, 0.3]]), torch.tensor([2])) + assert metric.compute() == 1.0 + + +def test_top_k_accuracy_metric_no_accumulate(): + """Test top-k accuracy metric returns correct value with no accumulate.""" + output = torch.tensor([[0.1, 0.2, 0.3, 0, 0.5], + [0.2, 0.3, 0.4, 0.1, 0]]) + + metric_k2 = TopKAccuracy(2, accumulate=False) + metric_k2.update(output, torch.tensor([4, 0])) + assert metric_k2.compute() == 0.5 + + metric_k2.update(torch.tensor([[0.1, 0.5, 0.3, 0.2, 0.4]]), torch.tensor([1])) + assert metric_k2.compute() == 1.0 + + # Check re-computation does not change value + assert metric_k2.compute() == 1.0 + + # Check reset works + metric_k2.reset() + metric_k2.update(output, torch.tensor([4, 0])) + assert metric_k2.compute() == 0.5 + + +def 
test_top_k_accuracy_metric_accumulate(): + """Test top-k accuracy metric returns correct value with accumulate.""" + output = torch.tensor([[0.1, 0.2, 0.3, 0, 0.5], + [0.2, 0.3, 0.4, 0.1, 0]]) + + metric_k2 = TopKAccuracy(2, accumulate=True) + metric_k2.update(output, torch.tensor([4, 0])) + assert metric_k2.compute() == 0.5 + + metric_k3 = TopKAccuracy(3, accumulate=True) + metric_k3.update(output, torch.tensor([4, 0])) + assert metric_k3.compute() == 1.0 + + metric_k2.update(torch.tensor([[0.1, 0.5, 0.3, 0.2, 0.4]]), torch.tensor([1])) + assert metric_k2.compute() == 2 / 3 + + # Check re-computation does not change value + assert metric_k2.compute() == 2 / 3 + + # Check reset works + metric_k2.reset() + metric_k2.update(output, torch.tensor([4, 0])) + assert metric_k2.compute() == 0.5 diff --git a/tests/common/test_parser.py b/tests/common/test_parser.py new file mode 100644 index 0000000..7f4f769 --- /dev/null +++ b/tests/common/test_parser.py @@ -0,0 +1,45 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. 
+# + +"""Test parser.""" + +import pytest +import torch + +from quant.common.parser import get_base_argument_parser, parse_config + + +@pytest.fixture() +def base_parser(): + """Fixture for base argument parser.""" + return get_base_argument_parser('base parser') + + +def test_standard_args(base_parser): + """Test parsing standard arguments.""" + args = base_parser.parse_args('--config examples/mnist/mnist_fp.yaml'.split(' ')) + config = parse_config(args) + + assert isinstance(config['experiment_name'], str) and len(config['experiment_name']) + assert config['environment']['platform'] == 'local' + assert config['environment']['ngpus'] == (1 if torch.cuda.is_available() else 0) + assert 'init_from_checkpoint' not in config + assert 'restore_experiment' not in config + assert not config['skip_training'] + + +def test_missing_config(base_parser): + """Test missing config.""" + args = base_parser.parse_args([]) + with pytest.raises(ValueError): + parse_config(args) + + +def test_gpu_override(base_parser): + """Test CLI ngpus argument can override what is in the config.""" + args = base_parser.parse_args('--config examples/mnist/mnist_fp.yaml --ngpus 8'.split(' ')) + config = parse_config(args) + + assert config['environment']['ngpus'] == 8 diff --git a/tests/common/test_tasks.py b/tests/common/test_tasks.py new file mode 100644 index 0000000..5a39426 --- /dev/null +++ b/tests/common/test_tasks.py @@ -0,0 +1,110 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. 
+# + +"""Test for running tasks.""" + +from quant.common.tasks import classification_task +from quant.utils.visualization import get_tensorboard_hooks +import pytest +import yaml + +from tests.data.helpers import get_base_config_template, RandomQuantDataLoader + + +@pytest.mark.incremental +class TestRunClassificationTask: + + arch_variants = [ + {'x_quant': 'fp', 'w_quant': 'fp'}, + {'x_quant': 'ls-2', 'w_quant': 'ls-1'}, + {'x_quant': 'gf-2', 'w_quant': 'ls-1'}, + {'x_quant': 'ls-2', 'w_quant': 'ls-1', + 'moving_average_mode': 'eval_only', 'moving_average_momentum': 0.9}, + {'x_quant': 'ls-1', 'w_quant': 'ls-1', + 'moving_average_mode': 'train_and_eval', 'moving_average_momentum': 0.9} + ] + + def test_train_regular_classification_task(self, tmp_path_factory): + """Train a model from scratch, which will be used as the teacher.""" + for i, arch_variant in enumerate(self.arch_variants): + base_dir = tmp_path_factory.getbasetemp() + config = get_base_config_template(base_dir, f'teacher_{i}', arch_variant) + classification_task( + config, + base_dir / 'experiments', + RandomQuantDataLoader, + get_tensorboard_hooks + ) + + with open(str(base_dir / 'experiments' / f'teacher_{i}' / 'config.yaml'), 'w') as f: + yaml.dump(config, f) + + def test_init_from_checkpoint(self, tmp_path_factory): + """Test initializing from checkpoint.""" + for i, arch_variant in enumerate(self.arch_variants): + base_dir = tmp_path_factory.getbasetemp() + config = get_base_config_template(base_dir, f'init_from_checkpoint_{i}', arch_variant) + config['init_from_checkpoint'] = str( + base_dir / 'experiments' / f'teacher_{i}' / 'checkpoints' / 'checkpoint_1.pt' + ) + classification_task( + config, + base_dir / 'experiments', + RandomQuantDataLoader, + get_tensorboard_hooks + ) + + def test_skip_training(self, tmp_path_factory): + """Test only doing inference.""" + for i, arch_variant in enumerate(self.arch_variants): + base_dir = tmp_path_factory.getbasetemp() + config = 
get_base_config_template(base_dir, f'skip_training_{i}', arch_variant) + config['skip_training'] = True + config['init_from_checkpoint'] = str( + base_dir / 'experiments' / f'teacher_{i}' / 'checkpoints' / 'checkpoint_1.pt' + ) + classification_task( + config, + base_dir / 'experiments', + RandomQuantDataLoader, + get_tensorboard_hooks + ) + + def test_restore_from_experiment(self, tmp_path_factory): + """Test restoring from experiment.""" + for i, arch_variant in enumerate(self.arch_variants): + base_dir = tmp_path_factory.getbasetemp() + config = get_base_config_template(base_dir, f'restore_experiment_{i}', arch_variant) + classification_task( + config, + base_dir / 'experiments', + RandomQuantDataLoader, + get_tensorboard_hooks, + base_dir / 'experiments' / f'teacher_{i}' + ) + + def test_train_student(self, tmp_path_factory): + """Train a student model using the teacher from above.""" + for i, arch_variant in enumerate(self.arch_variants): + base_dir = tmp_path_factory.getbasetemp() + config = get_base_config_template(base_dir, f'student_{i}', arch_variant) + config['model']['kd_config'] = { + 'teacher_config_path': str( + base_dir / 'experiments' / f'teacher_{i}' / 'config.yaml' + ), + 'teacher_checkpoint_path': str( + base_dir / 'experiments' / f'teacher_{i}' / 'checkpoints' / 'checkpoint_1.pt' + ), + 'freeze_teacher': True, + 'train_mode': True, + 'criterion_config': {'temperature': 1} + } + + classification_task( + config, + base_dir / 'experiments', + RandomQuantDataLoader, + get_tensorboard_hooks + ) diff --git a/tests/common/test_training.py b/tests/common/test_training.py new file mode 100644 index 0000000..3b88038 --- /dev/null +++ b/tests/common/test_training.py @@ -0,0 +1,75 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. 
+# + +"""Test training and testing loop.""" + +import unittest.mock as mock + +import pytest +import torch +import torch.nn.functional as F +from torch.utils.data import DataLoader + +from quant.common.initialization import get_optimizer, get_lr_scheduler +from quant.common.metrics import LossMetric +from quant.common.training import train, evaluate +from quant.models.lenet import QLeNet5 + +from tests.data.helpers import RandomDataset + + +@pytest.fixture +def random_data_loader(): + torch.manual_seed(260) + loader = DataLoader(RandomDataset(2), batch_size=32, num_workers=4, shuffle=False) + return loader + + +def test_training_loop(random_data_loader): + """Test the training loop.""" + device = torch.device('cpu') + model = QLeNet5(F.nll_loss).to(device) + metrics = { + 'Loss': LossMetric(model.loss_fn, accumulate=False) + } + optimizer = get_optimizer(model.parameters(), {'algorithm': 'sgd', 'lr': 0.1}) + scheduler = get_lr_scheduler( + optimizer, + {'scheduler': 'step_lr', 'step_size': 1, 'gamma': 0.7}, + 3, + len(random_data_loader) + ) + fake_hook = mock.MagicMock() + hooks = [fake_hook] + + losses = [] + for epoch in range(1, 3): + train( + model=model, train_loader=random_data_loader, metrics=metrics, + optimizer=optimizer, scheduler=scheduler, device=device, epoch=epoch, + log_interval=4, hooks=hooks + ) + losses.append(metrics['Loss'].compute()) + + # Ensure that hooks are called and loss is changing + assert fake_hook.called + assert losses[1] != losses[0] + + +def test_test_loop(random_data_loader): + """Test the test loop.""" + device = torch.device('cpu') + model = QLeNet5(F.nll_loss).to(device) + metrics = { + 'Loss': LossMetric(model.loss_fn, accumulate=False) + } + fake_hook = mock.MagicMock() + hooks = [fake_hook] + evaluate(model=model, test_loader=random_data_loader, metrics=metrics, device=device, + epoch=1, hooks=hooks) + + # Ensure that hooks are called and metric has value + assert fake_hook.called + assert 
isinstance(metrics['Loss'].compute(), float) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..6b5cd38 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,78 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# +"""Configuration for pytest.""" + +from typing import Dict, Tuple + +import pytest + +# store history of failures per test class name and per index in parametrize (if parametrize used) +_test_failed_incremental: Dict[str, Dict[Tuple[int, ...], str]] = {} + + +def pytest_addoption(parser): + parser.addoption( + '--runslow', action='store_true', default=False, help='run slow tests' + ) + + +def pytest_configure(config): + config.addinivalue_line('markers', 'slow: mark test as slow to run') + + +def pytest_collection_modifyitems(config, items): + if config.getoption('--runslow'): + # --runslow given in cli: do not skip slow tests + return + skip_slow = pytest.mark.skip(reason='need --runslow option to run') + for item in items: + if 'slow' in item.keywords: + item.add_marker(skip_slow) + + +# The two hooks below implement "incremental testing" +# If one step fails, further steps are not continued +# The code is from the pytest docs: https://docs.pytest.org/en/latest/example/simple.html + +def pytest_runtest_makereport(item, call): + if 'incremental' in item.keywords: + # incremental marker is used + if call.excinfo is not None: + # the test has failed + # retrieve the class name of the test + cls_name = str(item.cls) + # retrieve the index of the test + # (if parametrize is used in combination with incremental) + parametrize_index = ( + tuple(item.callspec.indices.values()) + if hasattr(item, 'callspec') + else () + ) + # retrieve the name of the test function + test_name = item.originalname or item.name + # store in _test_failed_incremental the original name of the failed test + _test_failed_incremental.setdefault(cls_name, {}).setdefault( + parametrize_index, 
test_name + ) + + +def pytest_runtest_setup(item): + if 'incremental' in item.keywords: + # retrieve the class name of the test + cls_name = str(item.cls) + # check if a previous test has failed for this class + if cls_name in _test_failed_incremental: + # retrieve the index of the test + # (if parametrize is used in combination with incremental) + parametrize_index = ( + tuple(item.callspec.indices.values()) + if hasattr(item, 'callspec') + else () + ) + # retrieve the name of the first test function to fail for this class name and index + test_name = _test_failed_incremental[cls_name].get(parametrize_index, None) + # if name found, test has failed for the combination of class name & test name + if test_name is not None: + pytest.xfail('previous test failed ({})'.format(test_name)) diff --git a/tests/data/__init__.py b/tests/data/__init__.py new file mode 100644 index 0000000..d99face --- /dev/null +++ b/tests/data/__init__.py @@ -0,0 +1,6 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# + +"""This package contains all tests for data-related modules.""" diff --git a/tests/data/helpers.py b/tests/data/helpers.py new file mode 100644 index 0000000..3226401 --- /dev/null +++ b/tests/data/helpers.py @@ -0,0 +1,114 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. 
+# + +"""Helpers for data loader tests.""" + +import typing as t + +import torch +from torch.utils.data import Dataset, Sampler +from torch.utils.data.dataloader import DataLoader + +from quant.data.data_loaders import QuantDataLoader + + +class RandomDataset(Dataset): + def __init__(self, num_classes: int): + self.nc = num_classes + + def __len__(self): + return 256 + + def __getitem__(self, index): + # return (data, target) as a tuple + return torch.normal(mean=0, std=1, size=(1, 28, 28)), torch.randint(0, self.nc, (1,)).item() + + +class RandomQuantDataLoader(QuantDataLoader): + + def __init__( + self, + train_batch_size: int, + test_batch_size: int, + dataset_path: str, + workers: int, + download: bool = False, + test_sampler: t.Optional[Sampler] = None, + num_classes: int = 10, + ): + """Construct a class for getting RandomQuantDataLoader data loaders.""" + super(RandomQuantDataLoader, self).__init__( + train_batch_size, + test_batch_size, + dataset_path, + workers, + download, + test_sampler, + ) + self.num_classes = num_classes + + def get_train_loader(self) -> DataLoader: + """Get a PyTorch data loader for the training set.""" + train_loader = DataLoader( + RandomDataset(self.num_classes), batch_size=self.train_batch_size, shuffle=False + ) + + return train_loader + + def get_test_loader(self) -> DataLoader: + """Get a PyTorch data loader for the test set.""" + test_loader = DataLoader( + RandomDataset(self.num_classes), batch_size=self.test_batch_size, + shuffle=False, sampler=self.test_sampler + ) + + return test_loader + + +def get_base_config_template(tmp_path, exp_name, arch_variant): + base_template = { + 'environment': {'ngpus': 0}, + 'experiment_name': exp_name, + 'skip_training': False, + 'data': { + 'dataset_path': str(tmp_path / 'data'), + 'train_batch_size': 64, + 'test_batch_size': 64, + 'workers': 4 + }, + 'model': { + 'architecture': 'lenet5', + 'loss': 'nll_loss', + 'arch_config': { + 'conv1_filters': 2, + 'conv2_filters': 5, + 
'output_classes': 10 + } + }, + 'optimization': { + 'epochs': 1, + 'optimizer': { + 'algorithm': 'adadelta', + 'lr': 0.1 + }, + 'lr_scheduler': { + 'scheduler': 'step_lr', + 'step_size': 1, + 'gamma': 0.9 + } + }, + 'log': { + 'level': 'INFO', + 'interval': 10, + 'tensorboard': True, + 'tensorboard_root': str(tmp_path / 'tb_runs'), + 'root_experiments_dir': str(tmp_path / 'experiments'), + 'save_model_freq': 1 + } + } + + base_template['model']['arch_config'].update(arch_variant) + + return base_template diff --git a/tests/data/test_data_loaders.py b/tests/data/test_data_loaders.py new file mode 100644 index 0000000..72a3b13 --- /dev/null +++ b/tests/data/test_data_loaders.py @@ -0,0 +1,87 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# +"""Tests for data loaders.""" + +import pytest +from torch.utils.data.sampler import SubsetRandomSampler + +from quant.data.data_loaders import ( + MNISTDataLoader, + CIFAR10DataLoader, + CIFAR100DataLoader, +) + + +@pytest.mark.slow +def test_mnist_data_loader(tmp_path_factory): + """Test MNIST data loading.""" + mnist_dir = tmp_path_factory.getbasetemp() / 'MNIST' + for download in (True, False): + mnist_data_loader = MNISTDataLoader(32, 32, mnist_dir, 4, download=download) + + train_loader = mnist_data_loader.get_train_loader() + assert len(train_loader.dataset) == 60000 + + test_loader = mnist_data_loader.get_test_loader() + assert len(test_loader.dataset) == 10000 + + subset_up_to = 64 + sampler = SubsetRandomSampler(range(subset_up_to)) + mnist_data_loader = MNISTDataLoader( + 32, 32, mnist_dir, 4, download=False, test_sampler=sampler + ) + + test_loader = mnist_data_loader.get_test_loader() + assert len(test_loader) == subset_up_to / 32 + + +@pytest.mark.slow +def test_cifar10_data_loader(tmp_path_factory): + """Test CIFAR-10 data loading.""" + cifar10_dir = tmp_path_factory.getbasetemp() / 'CIFAR-10' + for download in (True, False): + cifar10_data_loader = 
CIFAR10DataLoader( + 32, 32, cifar10_dir, 4, download=download + ) + + train_loader = cifar10_data_loader.get_train_loader() + assert len(train_loader.dataset) == 50000 + + test_loader = cifar10_data_loader.get_test_loader() + assert len(test_loader.dataset) == 10000 + + subset_up_to = 64 + sampler = SubsetRandomSampler(range(subset_up_to)) + cifar10_data_loader = CIFAR10DataLoader( + 32, 32, cifar10_dir, 4, download=False, test_sampler=sampler + ) + + test_loader = cifar10_data_loader.get_test_loader() + assert len(test_loader) == subset_up_to / 32 + + +@pytest.mark.slow +def test_cifar100_data_loader(tmp_path_factory): + """Test CIFAR-100 data loading.""" + cifar100_dir = tmp_path_factory.getbasetemp() / 'CIFAR-100' + for download in (True, False): + cifar100_data_loader = CIFAR100DataLoader( + 32, 32, cifar100_dir, 4, download=download + ) + + train_loader = cifar100_data_loader.get_train_loader() + assert len(train_loader.dataset) == 50000 + + test_loader = cifar100_data_loader.get_test_loader() + assert len(test_loader.dataset) == 10000 + + subset_up_to = 64 + sampler = SubsetRandomSampler(range(subset_up_to)) + cifar100_data_loader = CIFAR100DataLoader( + 32, 32, cifar100_dir, 4, download=False, test_sampler=sampler + ) + + test_loader = cifar100_data_loader.get_test_loader() + assert len(test_loader) == subset_up_to / 32 diff --git a/tests/models/__init__.py b/tests/models/__init__.py new file mode 100644 index 0000000..7f84ac2 --- /dev/null +++ b/tests/models/__init__.py @@ -0,0 +1,6 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# + +"""This package contains tests for models modules.""" diff --git a/tests/models/test_resnet.py b/tests/models/test_resnet.py new file mode 100644 index 0000000..cc99923 --- /dev/null +++ b/tests/models/test_resnet.py @@ -0,0 +1,136 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. 
+# + +"""Test ResNet.""" + +import torch +import torch.nn.functional as F + +from quant.models.resnet import QResNet + + +REGULAR_BASIC_CONFIG = { + "block": "regular", + "layer0": { + "bias": False, + "kernel_size": 7, + "maxpool": {"kernel_size": 3, "padding": 1, "stride": 2, "type": "maxpool2d"}, + "n_in_channels": 64, + "padding": 3, + "stride": 2, + }, + "layer1": {"clamp": {"kind": "identity"}, "w_quant": "fp", "x_quant": "fp"}, + "layer2": {"clamp": {"kind": "identity"}, "w_quant": "fp", "x_quant": "fp"}, + "layer3": {"clamp": {"kind": "identity"}, "w_quant": "fp", "x_quant": "fp"}, + "layer4": {"clamp": {"kind": "identity"}, "w_quant": "fp", "x_quant": "fp"}, + "nonlins": ["relu", "relu"], + "num_blocks": [2, 2, 2, 2], + "output_classes": 1000, +} + +XNOR_BASIC_CONFIG = { + "block": "xnor", + "layer0": { + "bias": False, + "kernel_size": 7, + "maxpool": {"kernel_size": 3, "padding": 1, "stride": 2, "type": "maxpool2d"}, + "n_in_channels": 64, + "padding": 3, + "stride": 2, + }, + "layer1": { + "clamp": {"alpha": 2, "kind": "symmetric"}, + "double_shortcut": False, + "w_quant": "ls-1", + "x_quant": "ls-1", + }, + "layer2": { + "clamp": {"alpha": 2, "kind": "symmetric"}, + "double_shortcut": False, + "w_quant": "ls-1", + "x_quant": "ls-1", + }, + "layer3": { + "clamp": {"alpha": 2, "kind": "symmetric"}, + "double_shortcut": False, + "w_quant": "ls-1", + "x_quant": "ls-1", + }, + "layer4": { + "clamp": {"alpha": 2, "kind": "symmetric"}, + "double_shortcut": False, + "w_quant": "ls-1", + "x_quant": "ls-1", + }, + "nonlins": ["prelu", "prelu"], + "num_blocks": [2, 2, 2, 2], + "output_classes": 1000, +} + +XNOR_BASIC_DOUBLE_SC_CONFIG = { + "block": "xnor", + "layer0": { + "bias": False, + "kernel_size": 7, + "maxpool": {"kernel_size": 3, "padding": 1, "stride": 2, "type": "maxpool2d"}, + "n_in_channels": 64, + "padding": 3, + "stride": 2, + }, + "layer1": { + "clamp": {"alpha": 2, "kind": "symmetric"}, + "double_shortcut": True, + "w_quant": "ls-1", + "x_quant": 
"ls-1", + }, + "layer2": { + "clamp": {"alpha": 2, "kind": "symmetric"}, + "double_shortcut": True, + "w_quant": "ls-1", + "x_quant": "ls-1", + }, + "layer3": { + "clamp": {"alpha": 2, "kind": "symmetric"}, + "double_shortcut": True, + "w_quant": "ls-1", + "x_quant": "ls-1", + }, + "layer4": { + "clamp": {"alpha": 2, "kind": "symmetric"}, + "double_shortcut": True, + "w_quant": "ls-1", + "x_quant": "ls-1", + }, + "nonlins": ["prelu", "prelu"], + "num_blocks": [2, 2, 2, 2], + "output_classes": 1000, +} + + +def test_regular_basic_block_forward(): + """Test forward pass of regular basic block.""" + torch.manual_seed(1234) + x = torch.randn(4, 3, 32, 32) + resnet = QResNet(loss_fn=F.cross_entropy, **REGULAR_BASIC_CONFIG) + y = resnet(x) + assert y.shape == (4, 1000) + + +def test_xnor_basic_block_forward(): + """Test forward pass of xnor basic block.""" + torch.manual_seed(1234) + x = torch.randn(4, 3, 32, 32) + resnet = QResNet(loss_fn=F.cross_entropy, **XNOR_BASIC_CONFIG) + y = resnet(x) + assert y.shape == (4, 1000) + + +def test_xnor_basic_with_double_shortcut_forward(): + """Test forward pass of xnor basic block with double shortcut.""" + torch.manual_seed(1234) + x = torch.randn(4, 3, 32, 32) + resnet = QResNet(loss_fn=F.cross_entropy, **XNOR_BASIC_DOUBLE_SC_CONFIG) + y = resnet(x) + assert y.shape == (4, 1000) diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py new file mode 100644 index 0000000..265e445 --- /dev/null +++ b/tests/utils/__init__.py @@ -0,0 +1,6 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# + +"""This package contains tests for utils modules.""" diff --git a/tests/utils/test_linear_lr_scheduler.py b/tests/utils/test_linear_lr_scheduler.py new file mode 100644 index 0000000..16f9631 --- /dev/null +++ b/tests/utils/test_linear_lr_scheduler.py @@ -0,0 +1,40 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. 
+# + +"""Test linear learning rate scheduler.""" + +import math + +import pytest +import torch.nn as nn +import torch.optim as optim + +from quant.utils.linear_lr_scheduler import LinearLR + + +def test_linear_lr_scheduler(): + """Test linear lr scheduler.""" + model = nn.Conv2d(3, 32, (2, 2), bias=False) + optimizer = optim.Adam(model.parameters(), lr=0.0002) + epochs = 120 + total_examples = 1281167 + batch_size = 256 + steps_per_epoch = int(math.ceil(total_examples / batch_size)) + scheduler = LinearLR(optimizer, 2e-7, epochs, steps_per_epoch) + + lrs = [] + for epoch in range(epochs): + for batch in range(steps_per_epoch): + lrs.append(optimizer.param_groups[0]['lr']) + optimizer.step() + scheduler.step() + + assert lrs[0] == 0.0002 + assert pytest.approx(lrs[1], 0.000199999663866, 1e-14) + assert pytest.approx(lrs[2], 0.000199999327731, 1e-14) + assert pytest.approx(lrs[80], 0.000199973109244, 1e-14) + assert pytest.approx(lrs[160], 0.000199946218487, 1e-14) + assert pytest.approx(lrs[60000], 0.000179831932773, 1e-14) + assert lrs[epochs * steps_per_epoch - 1] == 2e-7 diff --git a/tests/utils/test_moving_average.py b/tests/utils/test_moving_average.py new file mode 100644 index 0000000..2fe0762 --- /dev/null +++ b/tests/utils/test_moving_average.py @@ -0,0 +1,166 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. 
+# + +"""Test moving average.""" + +import pytest +import torch +import torch.nn as nn + +from quant.binary.activation_quantization import ActivationQuantizerLS1 +from quant.utils.moving_average import MovingAverage +from quant.binary.quantization import quantizer_ls_1 + + +def test_moving_average(): + """Test moving average.""" + x = torch.tensor([1.0]) + moving_avg = MovingAverage(momentum=torch.tensor([0.9])) + assert moving_avg(x) == x + + x = torch.tensor([2.0]) + assert torch.allclose(moving_avg(x), torch.tensor([0.9 * 1 + 0.1 * x])) + prev_result = torch.tensor([0.9 * 1 + 0.1 * x]) + + x = torch.tensor([3.0]) + assert torch.allclose(moving_avg(x), torch.tensor([0.9 * prev_result + 0.1 * x])) + + +def test_moving_average_multiple_momentum(): + """Test moving average with different momentum.""" + x = torch.tensor([2.0, 2.0]) + moving_avg = MovingAverage(momentum=torch.tensor([0.1, 0.2])) + assert torch.allclose(moving_avg(x), x) + + x = torch.tensor([4.0, 4.0]) + assert torch.allclose(moving_avg(x), torch.tensor([3.8, 3.6])) + + +def _compute_moving_average_closed_form(i, alpha): + """Compute the moving average for consecutive positive integers with momentum alpha.""" + return (alpha ** (i + 1) - (i + 1) * alpha + i) / (1 - alpha) + + +def test_moving_average_train_and_eval(): + """Test moving average with train_and_eval mode set in activation quantizer.""" + alpha = 0.9 + + devices = [torch.device('cpu')] + if torch.cuda.is_available(): + devices.append(torch.device('cuda:0')) + + for device in devices: + activation_quantizer = ActivationQuantizerLS1('train_and_eval', alpha) + activation_quantizer.to(device) + activation_quantizer.train() + for i in range(10): + x = i * torch.ones(8, 1, 20, 20, requires_grad=True, device=device) + x_q = activation_quantizer(x) + x_q.sum().backward() + + # Moving average internal statistics should be updated + actual_ma = activation_quantizer.moving_avg_module.moving_average + ma_i = _compute_moving_average_closed_form(i, 
alpha) + expected_ma = torch.tensor(ma_i, device=device).expand_as(actual_ma) + assert torch.allclose(expected_ma, actual_ma) + + # Quantization should be computed from moving average scalars + _, expected_quantization = quantizer_ls_1( + x, torch.tensor([ma_i], device=device).expand(8) + ) + assert torch.allclose(expected_quantization, x_q) + + activation_quantizer.eval() + for i in range(5): + x = i * torch.ones(8, 1, 20, 20, requires_grad=True, device=device) + activation_quantizer(x).sum().backward() + actual_ma = activation_quantizer.moving_avg_module.moving_average + # scalars should be memorized from train and not updated + expected_ma = torch.tensor( + _compute_moving_average_closed_form(9, alpha), device=device + ).expand_as(actual_ma) + assert torch.allclose(expected_ma, actual_ma) + + +def test_moving_average_eval_only(): + """Test moving average option with eval_only mode set in activation quantizer.""" + alpha = 0.9 + + devices = [torch.device('cpu')] + if torch.cuda.is_available(): + devices.append(torch.device('cuda:0')) + + for device in devices: + activation_quantizer = ActivationQuantizerLS1('eval_only', alpha) + activation_quantizer.to(device) + activation_quantizer.train() + for i in range(10): + x = i * torch.ones(8, 1, 20, 20, requires_grad=True, device=device) + x_q = activation_quantizer(x) + x_q.sum().backward() + + # Moving average internal statistics should be updated + actual_ma = activation_quantizer.moving_avg_module.moving_average + ma_i = _compute_moving_average_closed_form(i, alpha) + expected_ma = torch.tensor(ma_i, device=device).expand_as(actual_ma) + assert torch.allclose(expected_ma, actual_ma) + + # Quantization should NOT be computed from moving average scalars + assert torch.allclose(x, x_q) + + activation_quantizer.eval() + for i in range(5): + x = i * torch.ones(8, 1, 20, 20, requires_grad=True, device=device) + activation_quantizer(x).sum().backward() + actual_ma = activation_quantizer.moving_avg_module.moving_average + # 
scalars should be memorized from train and not updated + expected_ma = torch.tensor( + _compute_moving_average_closed_form(9, alpha), device=device + ).expand_as(actual_ma) + assert torch.allclose(expected_ma, actual_ma) + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason='requires >= 2 GPUs to run') +def test_moving_average_eval_only_multi_gpu(): + """Test moving average option with eval_only mode set in activation quantizer, with 2 GPUs.""" + alpha = 0.9 + activation_quantizer = ActivationQuantizerLS1('eval_only', alpha) + + activation_quantizer = nn.DataParallel(activation_quantizer, device_ids=[0, 1]) + device = torch.device('cuda:0') + activation_quantizer.to(device) + + activation_quantizer.train() + for i in range(10): + x_gpu0 = i * torch.ones(8, 1, 20, 20, requires_grad=True, device=device) + x_gpu1 = 42 * torch.ones(8, 1, 20, 20, requires_grad=True, device=device) + x = torch.cat([x_gpu0, x_gpu1], dim=0) + x_q = activation_quantizer(x) + x_q.sum().backward() + + # Moving average internal statistics should be updated + actual_ma = activation_quantizer.module.moving_avg_module.moving_average + ma_i = _compute_moving_average_closed_form(i, alpha) + expected_ma = torch.tensor(ma_i, device=device).expand_as(actual_ma) + assert torch.allclose(expected_ma, actual_ma) + + # Quantization should NOT be computed from moving average scalars + assert torch.allclose(x, x_q) + + activation_quantizer.eval() + for i in range(5): + x = 42 * torch.ones(16, 1, 20, 20, requires_grad=True, device=device) + x_q = activation_quantizer(x) + x_q.sum().backward() + actual_ma = activation_quantizer.module.moving_avg_module.moving_average + + # scalars should be memorized from train and not updated + ma_i = _compute_moving_average_closed_form(9, alpha) + expected_ma = torch.tensor(ma_i, device=device).expand_as(actual_ma) + assert torch.allclose(expected_ma, actual_ma) + + # Quantization should be using the moving average scalar from the 1st GPU during training + _, 
expected = quantizer_ls_1(x, torch.tensor([ma_i], device=device).expand(16)) + assert torch.allclose(x_q, expected)