Skip to content

Commit

Permalink
first version
Browse files Browse the repository at this point in the history
  • Loading branch information
mhrstmnn committed Mar 3, 2024
0 parents commit 6d02be9
Show file tree
Hide file tree
Showing 10 changed files with 329 additions and 0 deletions.
16 changes: 16 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# https://EditorConfig.org
root = true

[*]
indent_style = space
indent_size = 2
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true

[*.py]
indent_size = 4

[*.md]
trim_trailing_whitespace = false
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Python
/venv/**
/__pycache__/**

# Scripts
/data/**
!/data/.gitkeep
/out/**
21 changes: 21 additions & 0 deletions LICENSE.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Michael Horstmann

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
30 changes: 30 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Parser and Converter for Apple Health Export

Scripts to parse and convert health data after exporting it from Apple's Health app

## Requirements

- A somewhat modern version of [Python](https://www.python.org)
- [jq](https://jqlang.github.io/jq/), if you want to use the CSV conversion script
- The shell scripts were written for [Zsh](https://www.zsh.org)

## Installation

- Clone this repository from GitHub
- Then run: `./pip_install.sh`

## Usage

- First export all of your health data from Apple's Health app
- Then unpack the exported ZIP archive
- Copy the `Export.xml` file to the `data` directory
- Run one of the two scripts that start with `parse_and_convert`

## Helpful

- [Parsing Apple Health data](https://gist.github.com/hoffa/936db2bb85e134709cd263dd358ca309)
- [How to parse XML file exported from Apple iOS Health App](https://blog.gwlab.page/how-to-parse-xml-file-exported-from-apple-ios-health-app-and-make-a-sleep-schedule-plot-using-60c652697c81)

## Contribute

If you find a bug, feel free to create an issue or open a pull request.
Empty file added data/.gitkeep
Empty file.
19 changes: 19 additions & 0 deletions globals.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import os


# Location of the Health app's XML export, relative to the repository root
# (the user copies Export.xml here after unpacking the exported ZIP archive).
data_file_path = os.path.join('.', 'data', 'Export.xml')


def get_output_file_path(filename: str, suffix: str, subdirectory_name='') -> str:
    """Return the path ./out/<subdirectory>/<filename>.<suffix>, creating the directory.

    When no subdirectory name is given, the suffix doubles as the
    subdirectory name (e.g. JSON output lands in ./out/json/).
    """
    subdirectory = subdirectory_name or suffix
    output_directory = os.path.join('.', 'out', subdirectory)
    os.makedirs(output_directory, exist_ok=True)
    return os.path.join(output_directory, f'{filename}.{suffix}')


def get_argparse_description(output_description: str) -> str:
    """Build the two-line argparse description shared by both converter scripts."""
    intro = 'this is one of two scripts to parse and convert health data after exporting it from Apple\'s Health app:'
    detail = f'this script parses an XML file ("{data_file_path}") and converts it into {output_description}'
    return f'{intro}\n{detail}'
85 changes: 85 additions & 0 deletions parse_and_convert_to_json_and_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#!/usr/bin/env python3

import xml.etree.ElementTree as ET
import json
import subprocess
import argparse
import sys

import globals


def parse_health_export() -> tuple[list[dict], list[str]]:
    """Stream-parse the export XML and collect every Record element.

    Returns the list of record attribute dicts and the distinct record
    types in first-seen order.
    """
    records: list[dict] = []
    for _, element in ET.iterparse(globals.data_file_path):
        if element.tag == 'Record':
            records.append(element.attrib)

    # dict.fromkeys keeps insertion order, so this deduplicates while
    # preserving the order in which each type first appeared.
    record_types = list(dict.fromkeys(record['type'] for record in records))

    return records, record_types


def print_all_record_types(record_types: list[str]):
    """Pretty-print all distinct record types as an indented JSON array."""
    formatted = json.dumps(record_types, indent=4)
    print(formatted)


def write_all_records_json_file(records: list[dict]):
    """Dump every record to ./out/jq/all_records.json as pretty-printed JSON."""
    output_path = globals.get_output_file_path('all_records', 'json', 'jq')
    serialized = json.dumps(records, indent=4)
    with open(output_path, 'w') as json_file:
        json_file.write(serialized + '\n')


def write_all_records_txt_file(records: list[dict]):
    """Write records to ./out/jq/all_records.txt, one compact JSON object per line.

    The resulting JSON-Lines file is the input for the jq-based CSV conversion.
    """
    output_path = globals.get_output_file_path('all_records', 'txt', 'jq')
    lines = [json.dumps(record) + '\n' for record in records]
    with open(output_path, 'w') as txt_file:
        txt_file.writelines(lines)


def write_all_records_csv_file_with_jq():
    """Convert the JSON-Lines text file to CSV with jq.

    Fixes over the original: the shell pipeline string (shell=True with
    interpolated paths) is replaced by a list-args subprocess call with
    explicit file redirection, jq failures now raise (check=True) instead
    of being silently ignored, and the 3.12-only nested same-quote f-string
    is gone, so the script runs on older Python versions too.
    Requires write_all_records_txt_file to have been called first.
    """
    txt_file_path = globals.get_output_file_path('all_records', 'txt', 'jq')
    csv_file_path = globals.get_output_file_path('all_records', 'csv', 'jq')
    jq_filter = '[.type, .creationDate, .startDate, .endDate, .value, .unit, .device, .sourceName, .sourceVersion] | @csv'
    with open(txt_file_path) as txt_file, open(csv_file_path, 'w') as csv_file:
        subprocess.run(['jq', '-r', jq_filter], stdin=txt_file, stdout=csv_file, check=True)


def main() -> int:
    """CLI entry point: parse the health export and run the selected conversions."""
    parser = argparse.ArgumentParser(
        description=globals.get_argparse_description('JSON and CSV files'),
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-p', '--print-types', help='whether all record types should be printed', action='store_true')
    parser.add_argument('-j', '--write-json', help='whether all records JSON file should be written', action='store_true')
    parser.add_argument('-c', '--write-csv', help='whether all records CSV file should be written with jq', action='store_true')

    args = parser.parse_args()

    # No flags at all: show the help text instead of silently doing nothing.
    if not any(vars(args).values()):
        parser.print_help()
        return 0

    records, record_types = parse_health_export()

    # Printing the types is an inspection mode; it short-circuits any writing.
    if args.print_types:
        print('All record types:\n')
        print_all_record_types(record_types)
        return 0

    if args.write_json:
        print('All records JSON file is being written …')
        write_all_records_json_file(records)

    if args.write_csv:
        # The CSV is produced by jq from the intermediate JSON-Lines file,
        # so that file is always written first.
        print('All records text file is being written …')
        write_all_records_txt_file(records)
        print('All records CSV file is being written with jq …')
        write_all_records_csv_file_with_jq()

    return 0


# Run the CLI when executed as a script; propagate main()'s exit status.
if __name__ == '__main__':
    sys.exit(main())
143 changes: 143 additions & 0 deletions parse_and_convert_to_xlsx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
#!/usr/bin/env python3

import xml.etree.ElementTree as ET
import pandas as pd
import argparse
import sys

import globals


# Enable pandas copy-on-write so the drop(..., inplace=True) calls on
# query() results below only affect the local frame and do not trigger
# SettingWithCopyWarning.
pd.options.mode.copy_on_write = True


def parse_health_export() -> tuple[pd.DataFrame, list[str]]:
    """Parse the export XML into a DataFrame plus the distinct record types.

    The three date columns are parsed to datetimes with the timezone
    stripped — presumably so they can be written to Excel; confirm against
    openpyxl's handling of tz-aware datetimes.
    """
    root = ET.parse(globals.data_file_path).getroot()
    records_df = pd.DataFrame([element.attrib for element in root.iter('Record')])

    for column_name in ('creationDate', 'startDate', 'endDate'):
        records_df[column_name] = pd.to_datetime(records_df[column_name]).dt.tz_localize(None)

    record_types: list[str] = records_df['type'].unique().tolist()

    return records_df, record_types


def to_snake_case(string: str) -> str:
    """Convert a CamelCase/mixedCase identifier to snake_case."""
    parts = []
    for char in string:
        if char.isupper():
            parts.append('_')
            parts.append(char.lower())
        else:
            parts.append(char)
    # A leading uppercase letter produces a spurious leading underscore.
    return ''.join(parts).removeprefix('_')


def type_identifier_to_name(type_identifier: str) -> str:
    """Strip the HealthKit type-identifier prefix and convert the rest to snake_case."""
    stripped = (type_identifier
                .removeprefix('HKQuantityTypeIdentifier')
                .removeprefix('HKCategoryTypeIdentifier')
                .removeprefix('HKDataType'))
    return to_snake_case(stripped)


def print_all_record_types(record_types: list[str]):
    """Print a numbered list mapping each raw type identifier to its snake_case name."""
    for index, record_type in enumerate(record_types, start=1):
        print(f'{index}:', record_type, '->', type_identifier_to_name(record_type))


def write_all_records_excel_file(records_df: pd.DataFrame, rearranged=False):
    """Write the full records DataFrame to ./out/xlsx/all_records[_rearranged].xlsx."""
    filename = 'all_records_rearranged' if rearranged else 'all_records'
    file_path = globals.get_output_file_path(filename, 'xlsx')
    print(f'Write all records Excel file to: "{file_path}"')
    records_df.to_excel(file_path)


def write_blood_pressure_excel_file(rearranged_records_df: pd.DataFrame, reduce_output: bool):
    """Join systolic and diastolic readings and write them to one Excel file.

    Assumes that when reduce_output is true the input frame already lacks the
    device/sourceName/sourceVersion columns (main() builds it that way), so
    only the columns actually present are dropped in that case.

    Fixes over the original: the spurious ``f`` prefix on the diastolic query
    string (it contained no placeholders) is removed, and the
    ``inplace=True`` drops on query() results are replaced by explicit
    reassignment, which does not depend on the copy-on-write option.
    """
    blood_pressure_systolic_df = rearranged_records_df.query('type == "HKQuantityTypeIdentifierBloodPressureSystolic"')
    if reduce_output:
        blood_pressure_systolic_df = blood_pressure_systolic_df.drop(columns=['type', 'unit'])
    else:
        blood_pressure_systolic_df = blood_pressure_systolic_df.drop(columns=['type', 'unit', 'device', 'sourceName', 'sourceVersion'])

    blood_pressure_diastolic_df = rearranged_records_df.query('type == "HKQuantityTypeIdentifierBloodPressureDiastolic"')
    blood_pressure_diastolic_df = blood_pressure_diastolic_df.drop(columns=['type', 'startDate', 'endDate'])

    # Records sharing a creationDate belong to one measurement, so an inner
    # merge pairs each systolic value (value_x) with its diastolic value (value_y).
    merged_blood_pressure_df = pd.merge(blood_pressure_systolic_df, blood_pressure_diastolic_df, on='creationDate')
    merged_blood_pressure_df = merged_blood_pressure_df.rename(columns={'value_x': 'valueSystolic', 'value_y': 'valueDiastolic'})

    file_path = globals.get_output_file_path('blood_pressure', 'xlsx')
    print(f'Write blood pressure Excel file to: "{file_path}"')
    merged_blood_pressure_df.to_excel(file_path)


def write_all_other_excel_files(record_types: list[str], rearranged_records_df: pd.DataFrame):
    """Write one Excel file per record type, skipping the blood-pressure types.

    Blood pressure is handled separately by write_blood_pressure_excel_file.

    Fixes over the original: the blood-pressure types are filtered out
    instead of being removed from the caller's list in place (a hidden side
    effect), the nested same-quote f-string (valid only on Python 3.12+) is
    replaced by a precomputed variable, and the ``inplace=True`` drops on
    query() results are replaced by explicit reassignment.
    """
    blood_pressure_types = {
        'HKQuantityTypeIdentifierBloodPressureSystolic',
        'HKQuantityTypeIdentifierBloodPressureDiastolic',
    }

    for record_type in (t for t in record_types if t not in blood_pressure_types):
        filtered_records_df = rearranged_records_df.query(f'type == "{record_type}"')
        filtered_records_df = filtered_records_df.drop(columns='type')

        # Category-type records get the unit column dropped — presumably they
        # carry no meaningful unit; confirm against the export data.
        if record_type.startswith('HKCategoryTypeIdentifier'):
            filtered_records_df = filtered_records_df.drop(columns='unit')

        record_name = type_identifier_to_name(record_type)
        display_name = record_name.replace('_', ' ')
        file_path = globals.get_output_file_path(record_name, 'xlsx')
        print(f'Write {display_name} Excel file to: "{file_path}"')
        filtered_records_df.to_excel(file_path)


def main() -> int:
    """CLI entry point: parse the health export and write the requested Excel files."""
    parser = argparse.ArgumentParser(
        description=globals.get_argparse_description('Excel files'),
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-p', '--print-types', help='whether all record types should be printed', action='store_true')
    parser.add_argument('-o', '--one-file', help='whether all records should be written to one Excel file', action='store_true')
    parser.add_argument('-s', '--separate-files', help='whether all records should be written to separate Excel files', action='store_true')
    parser.add_argument('-r', '--reduce-output', help='whether to reduce output when writing Excel files', action='store_true')

    args = parser.parse_args()

    # No flags at all: show the help text instead of silently doing nothing.
    if not any(vars(args).values()):
        parser.print_help()
        return 0

    # --reduce-output on its own writes no files, so tell the user and stop early.
    if args.reduce_output and sum(vars(args).values()) == 1:
        print('info: the option to reduce output has no effect if no Excel files are being written')
        return 0

    records_df, record_types = parse_health_export()

    # Printing the types is an inspection mode; it short-circuits any writing.
    if args.print_types:
        print('All record types:\n')
        print_all_record_types(record_types)
        return 0

    # Reorder the columns; without --reduce-output the device/source
    # metadata columns are kept as well.
    column_order = ['type', 'creationDate', 'startDate', 'endDate', 'value', 'unit']
    if not args.reduce_output:
        column_order += ['device', 'sourceName', 'sourceVersion']
    rearranged_records_df = records_df[column_order]

    if args.one_file:
        print('All records Excel files are being written:\n')
        write_all_records_excel_file(records_df)
        write_all_records_excel_file(rearranged_records_df, True)
        if args.separate_files:
            print('\n')

    if args.separate_files:
        print('All other Excel files are being written:\n')
        write_blood_pressure_excel_file(rearranged_records_df, args.reduce_output)
        write_all_other_excel_files(record_types, rearranged_records_df)

    return 0


# Run the CLI when executed as a script; propagate main()'s exit status.
if __name__ == '__main__':
    sys.exit(main())
5 changes: 5 additions & 0 deletions pip_install.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/zsh

# Create a local virtual environment and install the project's Python
# dependencies into it.
# Abort on the first failing command so that a failed venv creation does
# not lead to packages being installed into the global interpreter.
set -e

python3 -m venv venv
source ./venv/bin/activate
python3 -m pip install -r requirements.txt
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
pandas
openpyxl

0 comments on commit 6d02be9

Please sign in to comment.