From 80460838b4d01f5ed2e284c7db79b8c0e1a5604b Mon Sep 17 00:00:00 2001
From: Yukuo Cen
Date: Sun, 2 Jun 2019 17:37:30 +0800
Subject: [PATCH] Add README.md & Refactor

---
 .gitignore                               |  129 +
 README.md                                |   86 +
 example_data/feature.txt                 |  521 ++++
 example_data/test.txt                    |  794 +++++
 example_data/train.txt                   | 3475 ++++++++++++++++++++++
 example_data/valid.txt                   |  398 +++
 requirements.txt                         |    6 +
 scripts/download_preprocessed_dataset.py |  119 +
 scripts/run_example.sh                   |    1 +
 main.py => src/main.py                   |   16 +-
 utils.py => src/utils.py                 |    2 -
 walk.py => src/walk.py                   |    0
 12 files changed, 5537 insertions(+), 10 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 example_data/feature.txt
 create mode 100644 example_data/test.txt
 create mode 100644 example_data/train.txt
 create mode 100644 example_data/valid.txt
 create mode 100644 requirements.txt
 create mode 100644 scripts/download_preprocessed_dataset.py
 create mode 100755 scripts/run_example.sh
 rename main.py => src/main.py (96%)
 rename utils.py => src/utils.py (96%)
 rename walk.py => src/walk.py (100%)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..4da085b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,129 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+.vscode
+data/
+runs/
+src/__pycache__
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..3764e9a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,86 @@
+# GATNE
+
+### [Project](https://sites.google.com/view/gatne) | [Arxiv](https://arxiv.org/abs/1905.01669)
+
+Representation Learning for Attributed Multiplex Heterogeneous Network.

[Yukuo Cen](https://sites.google.com/view/yukuocen), Xu Zou, Jianwei Zhang, [Hongxia Yang](https://sites.google.com/site/hystatistics/home), [Jingren Zhou](http://www.cs.columbia.edu/~jrzhou/), [Jie Tang](http://keg.cs.tsinghua.edu.cn/jietang/)

Accepted to KDD 2019 Research Track!

## Prerequisites

- Linux or macOS
- Python 3
- TensorFlow >= 1.8
- NVIDIA GPU + CUDA cuDNN

## Getting Started

### Installation

Clone this repo.

```bash
git clone https://github.com/THUDM/GATNE
cd GATNE
```

Please install the dependencies by running:

```bash
pip install -r requirements.txt
```

### Dataset

These datasets are sampled from the original datasets:

- Amazon contains 10,166 nodes and 148,865 edges. [Source](http://jmcauley.ucsd.edu/data/amazon)
- Twitter contains 10,000 nodes and 331,899 edges. [Source](https://snap.stanford.edu/data/higgs-twitter.html)
- YouTube contains 2,000 nodes and 1,310,617 edges. [Source](http://socialcomputing.asu.edu/datasets/YouTube)
- Alibaba contains 6,163 nodes and 17,865 edges.

You can download the preprocessed datasets by running `python scripts/download_preprocessed_dataset.py`. (The Alibaba dataset is to be released.)
If you are in a region where Dropbox is blocked (e.g., mainland China), try `python scripts/download_preprocessed_dataset.py --cn`.

### Training

#### Training on the existing datasets

You can use `./scripts/run_example.sh` or `python src/main.py --input example_data` to train the GATNE-T model on the example data. (If you share the server with others or want to use specific GPUs, you may need to set `CUDA_VISIBLE_DEVICES`.)

If you want to train on the Amazon dataset, you can run `python src/main.py --input data/amazon` or `python src/main.py --input data/amazon --features data/feature.txt` to train the GATNE-T or GATNE-I model, respectively.

You can use the following commands to train GATNE-T on the Twitter and YouTube datasets. We only evaluate edges of the first edge type on the Twitter dataset, as the other edge types have too few edges.

- `python src/main.py --input data/twitter --eval-type 1`
- `python src/main.py --input data/youtube`

As the Twitter and YouTube datasets do not have node attributes, you can generate heuristic features for them, such as DeepWalk embeddings, and then train the GATNE-I model on these two datasets by adding the `--features` argument.

#### Training on your own datasets

If you want to train GATNE-T/I on your own dataset, you should prepare the following three (or four) files:
- train.txt: Each line represents an edge, which contains three tokens `<edge_type> <head> <tail>`, where each token can be either a number or a string.
- valid.txt: Each line represents an edge or a non-edge, which contains four tokens `