{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import pyarrow as pa\n", "import numpy as np\n", "import json\n", "from datetime import date, datetime" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "with_json = False" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def json_serial(obj):\n", " return int(obj.strftime(\"%s\"))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
recorddatestationAWNDPRCPSNOWSNWDTMAXTMAX_FTMINTMIN_F...WT22elevationgsn_flaghcn_crn_flagidlatitudelongitudenamestatewmo_id
020120101USS0006H19SNaN0.0NaN356.01.133.98-15.44.28...NaN2572.5NaNNaNNaN41.3300-106.5000NaNWY South Brush CreekNaN
120120101USC00390043NaN0.00.00.05.642.08-2.826.96...NaN512.1HCNNaNNaN43.4892-99.0631NaNSD ACADEMY 2NENaN
220120101SWE00138512NaN124.0NaN0.0NaNNaNNaNNaN...NaN90.0NaNNaNNaN58.450014.8900NaNVADSTENANaN
320120101USC00163807NaN0.0NaNNaN25.678.0817.262.96...NaN0.6NaNNaNNaN29.2414-89.9914NaNLA GRAND ISLENaN
420120101USC00163800NaN0.0NaNNaN20.669.0815.660.08...NaN16.8HCNNaNNaN30.4183-92.0442NaNLA GRAND COTEAUNaN
\n", "

5 rows × 41 columns

\n", "
" ], "text/plain": [ " recorddate station AWND PRCP SNOW SNWD TMAX TMAX_F TMIN \\\n", "0 20120101 USS0006H19S NaN 0.0 NaN 356.0 1.1 33.98 -15.4 \n", "1 20120101 USC00390043 NaN 0.0 0.0 0.0 5.6 42.08 -2.8 \n", "2 20120101 SWE00138512 NaN 124.0 NaN 0.0 NaN NaN NaN \n", "3 20120101 USC00163807 NaN 0.0 NaN NaN 25.6 78.08 17.2 \n", "4 20120101 USC00163800 NaN 0.0 NaN NaN 20.6 69.08 15.6 \n", "\n", " TMIN_F ... WT22 elevation gsn_flag hcn_crn_flag id latitude \\\n", "0 4.28 ... NaN 2572.5 NaN NaN NaN 41.3300 \n", "1 26.96 ... NaN 512.1 HCN NaN NaN 43.4892 \n", "2 NaN ... NaN 90.0 NaN NaN NaN 58.4500 \n", "3 62.96 ... NaN 0.6 NaN NaN NaN 29.2414 \n", "4 60.08 ... NaN 16.8 HCN NaN NaN 30.4183 \n", "\n", " longitude name state wmo_id \n", "0 -106.5000 NaN WY South Brush Creek NaN \n", "1 -99.0631 NaN SD ACADEMY 2NE NaN \n", "2 14.8900 NaN VADSTENA NaN \n", "3 -89.9914 NaN LA GRAND ISLE NaN \n", "4 -92.0442 NaN LA GRAND COTEAU NaN \n", "\n", "[5 rows x 41 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.read_csv('weather-10m.csv', encoding='utf-8')\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['recorddate', 'station', 'AWND', 'PRCP', 'SNOW', 'SNWD', 'TMAX',\n", " 'TMAX_F', 'TMIN', 'TMIN_F', 'WT01', 'WT02', 'WT03', 'WT04', 'WT05',\n", " 'WT06', 'WT07', 'WT08', 'WT09', 'WT10', 'WT11', 'WT12', 'WT13', 'WT14',\n", " 'WT15', 'WT16', 'WT17', 'WT18', 'WT19', 'WT20', 'WT21', 'WT22',\n", " 'elevation', 'gsn_flag', 'hcn_crn_flag', 'id', 'latitude', 'longitude',\n", " 'name', 'state', 'wmo_id'],\n", " dtype='object')" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.columns" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "data = data.drop([\"SNOW\"], axis=1)\n", "renamed = data.rename(index=str, columns={\n", " \"TMAX\": \"TEMP_MAX\", \n", " \"TMIN\": \"TEMP_MIN\",\n", " \"PRCP\": \"PRECIPITATION\",\n", " \"elevation\": \"ELEVATION\", \n", " \"AWND\": \"WIND\", \n", " \"latitude\": \"LATITUDE\", \n", " \"longitude\": \"LONGITUDE\",\n", " \"SNWD\": \"SNOW\"\n", "})\n", "\n", "renamed['RECORD_DATE'] = pd.to_datetime(renamed.recorddate, format='%Y%m%d').dt.date\n", "\n", "renamed = renamed[[\"RECORD_DATE\", \"TEMP_MAX\", \"TEMP_MIN\", \"PRECIPITATION\", \"ELEVATION\", \"WIND\", \"SNOW\", \"LATITUDE\", \"LONGITUDE\"]]\n", "\n", "renamed['WIND'].fillna(0, inplace=True)\n", "renamed['SNOW'].fillna(0, inplace=True)\n", "\n", "renamed['WIND'] = renamed['WIND'] / 10\n", "# temperature is already corrected in the data\n", "# renamed['TEMP_MIN'] = renamed['TEMP_MIN'] / 10\n", "# renamed['TEMP_MAX'] = renamed['TEMP_MAX'] / 10\n", "renamed['PRECIPITATION'] = renamed['PRECIPITATION'] / 10\n", "\n", "cleaned = renamed.dropna()\n", "\n", "right_types = cleaned.astype({\n", " 'TEMP_MAX': 'float32',\n", " 'TEMP_MIN': 'float32',\n", " 'PRECIPITATION': 'float32',\n", " 'ELEVATION': 'float32',\n", " 'WIND': 'float32',\n", " 'SNOW': 'float32',\n", " 'LATITUDE': 'float32',\n", " 'LONGITUDE': 'float32'\n", "})" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
RECORD_DATETEMP_MAXTEMP_MINPRECIPITATIONELEVATIONWINDSNOWLATITUDELONGITUDE
02012-01-011.1-15.40.02572.50.0356.041.3300-106.5000
12012-01-015.6-2.80.0512.10.00.043.4892-99.0631
32012-01-0125.617.20.00.60.00.029.2414-89.9914
42012-01-0120.615.60.016.80.00.030.4183-92.0442
52012-01-01-6.6-17.10.0581.00.090.049.2167-102.9667
\n", "
" ], "text/plain": [ " RECORD_DATE TEMP_MAX TEMP_MIN PRECIPITATION ELEVATION WIND SNOW \\\n", "0 2012-01-01 1.1 -15.4 0.0 2572.5 0.0 356.0 \n", "1 2012-01-01 5.6 -2.8 0.0 512.1 0.0 0.0 \n", "3 2012-01-01 25.6 17.2 0.0 0.6 0.0 0.0 \n", "4 2012-01-01 20.6 15.6 0.0 16.8 0.0 0.0 \n", "5 2012-01-01 -6.6 -17.1 0.0 581.0 0.0 90.0 \n", "\n", " LATITUDE LONGITUDE \n", "0 41.3300 -106.5000 \n", "1 43.4892 -99.0631 \n", "3 29.2414 -89.9914 \n", "4 30.4183 -92.0442 \n", "5 49.2167 -102.9667 " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cleaned.head()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "weather-10k\n", "weather-200k\n", "weather-500k\n", "weather-1m\n", "weather-3m\n", "weather-10m\n" ] } ], "source": [ "for size, name in [(10000, 'weather-10k'), (200000, 'weather-200k'), (500000, 'weather-500k'), (1000000, 'weather-1m'), (3000000, 'weather-3m'), (10000000, 'weather-10m')]:\n", " print(name)\n", "\n", " smaller = right_types[:size]\n", " \n", " table = pa.Table.from_pandas(smaller)\n", " \n", " if with_json:\n", " d = {}\n", " for column in smaller.columns:\n", " d[column]=list(smaller[column])\n", "\n", " with open(f'{name}.json', 'w') as f:\n", " json.dump(d, f, default=json_serial, separators=(',', ':'))\n", "\n", " # table = table.column('ARRIVAL').cast(pa.TimestampValue, True)\n", "\n", " writer = pa.RecordBatchFileWriter(f'{name}.arrow', table.schema)\n", " writer.write(table)\n", " writer.close()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }