From 95baacec3c4783b0ed4c2162f72faab7717a89f5 Mon Sep 17 00:00:00 2001 From: isaacmasgrau <112957591+isaacmasgrau@users.noreply.github.com> Date: Tue, 11 Oct 2022 16:09:54 +0200 Subject: [PATCH] Add files via upload --- GreatExpectations.ipynb | 266 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 266 insertions(+) create mode 100644 GreatExpectations.ipynb diff --git a/GreatExpectations.ipynb b/GreatExpectations.ipynb new file mode 100644 index 0000000..9580967 --- /dev/null +++ b/GreatExpectations.ipynb @@ -0,0 +1,266 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "5dvEVFgjKicZ" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import great_expectations as ge" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 523 + }, + "id": "_Jig0D7XLBy3", + "outputId": "60b19da3-7833-4350-ab3c-001e9d8b7287" + }, + "outputs": [], + "source": [ + "train_path = \"training.1600000.processed.noemoticon.csv\"\n", + "\n", + "# Reading the dataset with no columns titles and with latin encoding \n", + "df = pd.read_csv(train_path, sep = \",\", encoding='latin-1', header=None, error_bad_lines=False)\n", + "\n", + "# As the data has no column titles, we will add our own\n", + "df.columns = [\"target\", \"ids\", \"date\", \"flag\", \"user\", \"text\"]\n", + "\n", + "# Convert the dataframe to a Great Expectations dataset\n", + "df = ge.dataset.PandasDataset(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "KlevuevrKvLA" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'success': True,\n", + " 'result': {'observed_value': ['target',\n", + " 'ids',\n", + " 'date',\n", + " 'flag',\n", + " 'user',\n", + " 'text']}}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Presence of specific features\n", + "df.expect_table_columns_to_match_ordered_list(column_list=[\"target\", \"ids\", \"date\", \"flag\", \"user\", \"text\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "1DCBlXwKKs-o" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'success': True,\n", + " 'result': {'element_count': 1600000,\n", + " 'missing_count': 0,\n", + " 'missing_percent': 0.0,\n", + " 'unexpected_count': 0,\n", + " 'unexpected_percent': 0.0,\n", + " 'partial_unexpected_list': []}}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Missing values\n", + "df.expect_column_values_to_not_be_null(column=\"target\")\n", + "df.expect_column_values_to_not_be_null(column=\"ids\")\n", + "df.expect_column_values_to_not_be_null(column=\"date\")\n", + "df.expect_column_values_to_not_be_null(column=\"flag\")\n", + "df.expect_column_values_to_not_be_null(column=\"user\")\n", + "df.expect_column_values_to_not_be_null(column=\"text\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "jHI0vWtGKqg1" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'success': False,\n", + " 'result': {'element_count': 1600000,\n", + " 'missing_count': 0,\n", + " 'missing_percent': 0.0,\n", + " 'unexpected_count': 3370,\n", + " 'unexpected_percent': 0.00210625,\n", + " 'unexpected_percent_nonmissing': 0.00210625,\n", + " 'partial_unexpected_list': [1467863684,\n", + " 1467880442,\n", + " 1468053611,\n", + " 1468100580,\n", + " 1468115720,\n", + " 1468131748,\n", + " 1468161883,\n", + " 1468224250,\n", + " 1468310350,\n", + " 1468338634,\n", + " 1468363676,\n", + " 1468502040,\n", + " 1468503801,\n", + " 1468544973,\n", + " 1468586841,\n", + " 1468639063,\n", + " 1468652839,\n", + " 1468714181,\n", + " 1468758512,\n", + " 1468833927]}}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Unique values\n", + "df.expect_column_values_to_be_unique(column=\"ids\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "oZcJnpoEKoX_" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'success': True, 'result': {'observed_value': 'int64'}}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Type adherence\n", + "df.expect_column_values_to_be_of_type(column=\"user\", type_=\"str\")\n", + "df.expect_column_values_to_be_of_type(column=\"text\", type_=\"str\")\n", + "df.expect_column_values_to_be_of_type(column=\"target\", type_=\"int64\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "UrJ4_A2rKk1W" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'results': [{'success': False,\n", + " 'result': {'element_count': 1600000,\n", + " 'missing_count': 0,\n", + " 'missing_percent': 0.0,\n", + " 'unexpected_count': 3370,\n", + " 'unexpected_percent': 0.00210625,\n", + " 'unexpected_percent_nonmissing': 0.00210625,\n", + " 'partial_unexpected_list': [1467863684,\n", + " 1467880442,\n", + " 1468053611,\n", + " 1468100580,\n", + " 1468115720,\n", + " 1468131748,\n", + " 1468161883,\n", + " 1468224250,\n", + " 1468310350,\n", + " 1468338634,\n", + " 1468363676,\n", + " 1468502040,\n", + " 1468503801,\n", + " 1468544973,\n", + " 1468586841,\n", + " 1468639063,\n", + " 1468652839,\n", + " 1468714181,\n", + " 1468758512,\n", + " 1468833927]},\n", + " 'exception_info': {'raised_exception': False,\n", + " 'exception_message': None,\n", + " 'exception_traceback': None},\n", + " 'expectation_config': {'expectation_type': 'expect_column_values_to_be_unique',\n", + " 'kwargs': {'column': 'ids'}}}],\n", + " 'success': False,\n", + " 'statistics': {'evaluated_expectations': 11,\n", + " 'successful_expectations': 10,\n", + " 'unsuccessful_expectations': 1,\n", + " 'success_percent': 90.9090909090909},\n", + " 'meta': {'great_expectations.__version__': '0.7.6',\n", + " 'data_asset_name': None,\n", + " 'expectation_suite_name': 'default',\n", + " 'run_id': '2022-10-11T135131.194037Z'}}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Expectation suite\n", + "expectation_suite = df.get_expectation_suite(discard_failed_expectations=False)\n", + "df.validate(expectation_suite=expectation_suite,only_return_failures=True)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}