Skip to content

Commit

Permalink
first version
Browse files Browse the repository at this point in the history
  • Loading branch information
mhrstmnn committed Mar 3, 2024
0 parents commit 6d02be9
Show file tree
Hide file tree
Showing 10 changed files with 329 additions and 0 deletions.
16 changes: 16 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# https://EditorConfig.org
root = true

[*]
indent_style = space
indent_size = 2
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true

[*.py]
indent_size = 4

[*.md]
trim_trailing_whitespace = false
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Python
/venv/**
/__pycache__/**

# Scripts
/data/**
!/data/.gitkeep
/out/**
21 changes: 21 additions & 0 deletions LICENSE.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Michael Horstmann

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
30 changes: 30 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Parser and Converter for Apple Health Export

Scripts to parse and convert health data after exporting it from Apple's Health app

## Requirements

- A somewhat modern version of [Python](https://www.python.org)
- [jq](https://jqlang.github.io/jq/), if you want to use the CSV conversion script
- The shell scripts were written for [Zsh](https://www.zsh.org)

## Installation

- Clone this repository from GitHub
- Then run: `./pip_install.sh`

## Usage

- First export all of your health data from Apple's Health app
- Then unpack the exported ZIP archive
- Copy the `Export.xml` file to the `data` directory
- Run one of the two scripts that start with `parse_and_convert`

## Helpful

- [Parsing Apple Health data](https://gist.github.com/hoffa/936db2bb85e134709cd263dd358ca309)
- [How to parse XML file exported from Apple iOS Health App](https://blog.gwlab.page/how-to-parse-xml-file-exported-from-apple-ios-health-app-and-make-a-sleep-schedule-plot-using-60c652697c81)

## Contribute

If you find a bug, feel free to create an issue or open a pull request.
Empty file added data/.gitkeep
Empty file.
19 changes: 19 additions & 0 deletions globals.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import os


# Location of the Health app's XML export, relative to the repository root
# (the user copies Export.xml here after unpacking the exported ZIP archive).
data_file_path = os.path.join('.', 'data', 'Export.xml')


def get_output_file_path(filename: str, suffix: str, subdirectory_name='') -> str:
    """Return the path ./out/<subdirectory>/<filename>.<suffix>, creating the directory.

    When no subdirectory name is given, the suffix doubles as the
    subdirectory name (e.g. JSON output lands in ./out/json/).
    """
    subdirectory = subdirectory_name or suffix
    output_directory = os.path.join('.', 'out', subdirectory)
    os.makedirs(output_directory, exist_ok=True)
    return os.path.join(output_directory, f'{filename}.{suffix}')


def get_argparse_description(output_description: str) -> str:
    """Build the two-line argparse description shared by both converter scripts."""
    intro = 'this is one of two scripts to parse and convert health data after exporting it from Apple\'s Health app:'
    detail = f'this script parses an XML file ("{data_file_path}") and converts it into {output_description}'
    return f'{intro}\n{detail}'
85 changes: 85 additions & 0 deletions parse_and_convert_to_json_and_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#!/usr/bin/env python3

import xml.etree.ElementTree as ET
import json
import subprocess
import argparse
import sys

import globals


def parse_health_export() -> tuple[list[dict], list[str]]:
    """Stream-parse the export XML and collect every Record element.

    Returns the list of record attribute dicts and the distinct record
    types in first-seen order.
    """
    records: list[dict] = []
    for _, element in ET.iterparse(globals.data_file_path):
        if element.tag == 'Record':
            records.append(element.attrib)

    # dict.fromkeys keeps insertion order, so this deduplicates while
    # preserving the order in which each type first appeared.
    record_types = list(dict.fromkeys(record['type'] for record in records))

    return records, record_types


def print_all_record_types(record_types: list[str]):
    """Pretty-print all distinct record types as an indented JSON array."""
    formatted = json.dumps(record_types, indent=4)
    print(formatted)


def write_all_records_json_file(records: list[dict]):
    """Dump every record to ./out/jq/all_records.json as pretty-printed JSON."""
    output_path = globals.get_output_file_path('all_records', 'json', 'jq')
    serialized = json.dumps(records, indent=4)
    with open(output_path, 'w') as json_file:
        json_file.write(serialized + '\n')


def write_all_records_txt_file(records: list[dict]):
    """Write records to ./out/jq/all_records.txt, one compact JSON object per line.

    The resulting JSON-Lines file is the input for the jq-based CSV conversion.
    """
    output_path = globals.get_output_file_path('all_records', 'txt', 'jq')
    lines = [json.dumps(record) + '\n' for record in records]
    with open(output_path, 'w') as txt_file:
        txt_file.writelines(lines)


def write_all_records_csv_file_with_jq():
    """Convert the JSON-Lines text file to CSV with jq.

    Fixes over the original: the shell pipeline string (shell=True with
    interpolated paths) is replaced by a list-args subprocess call with
    explicit file redirection, jq failures now raise (check=True) instead
    of being silently ignored, and the 3.12-only nested same-quote f-string
    is gone, so the script runs on older Python versions too.
    Requires write_all_records_txt_file to have been called first.
    """
    txt_file_path = globals.get_output_file_path('all_records', 'txt', 'jq')
    csv_file_path = globals.get_output_file_path('all_records', 'csv', 'jq')
    jq_filter = '[.type, .creationDate, .startDate, .endDate, .value, .unit, .device, .sourceName, .sourceVersion] | @csv'
    with open(txt_file_path) as txt_file, open(csv_file_path, 'w') as csv_file:
        subprocess.run(['jq', '-r', jq_filter], stdin=txt_file, stdout=csv_file, check=True)


def main() -> int:
    """CLI entry point: parse the health export and run the selected conversions."""
    parser = argparse.ArgumentParser(
        description=globals.get_argparse_description('JSON and CSV files'),
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-p', '--print-types', help='whether all record types should be printed', action='store_true')
    parser.add_argument('-j', '--write-json', help='whether all records JSON file should be written', action='store_true')
    parser.add_argument('-c', '--write-csv', help='whether all records CSV file should be written with jq', action='store_true')

    args = parser.parse_args()

    # No flags at all: show the help text instead of silently doing nothing.
    if not any(vars(args).values()):
        parser.print_help()
        return 0

    records, record_types = parse_health_export()

    # Printing the types is an inspection mode; it short-circuits any writing.
    if args.print_types:
        print('All record types:\n')
        print_all_record_types(record_types)
        return 0

    if args.write_json:
        print('All records JSON file is being written …')
        write_all_records_json_file(records)

    if args.write_csv:
        # The CSV is produced by jq from the intermediate JSON-Lines file,
        # so that file is always written first.
        print('All records text file is being written …')
        write_all_records_txt_file(records)
        print('All records CSV file is being written with jq …')
        write_all_records_csv_file_with_jq()

    return 0


# Run the CLI when executed as a script; propagate main()'s exit status.
if __name__ == '__main__':
    sys.exit(main())
143 changes: 143 additions & 0 deletions parse_and_convert_to_xlsx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
#!/usr/bin/env python3

import xml.etree.ElementTree as ET
import pandas as pd
import argparse
import sys

import globals


# Enable pandas copy-on-write so the drop(..., inplace=True) calls on
# query() results below only affect the local frame and do not trigger
# SettingWithCopyWarning.
pd.options.mode.copy_on_write = True


def parse_health_export() -> tuple[pd.DataFrame, list[str]]:
    """Parse the export XML into a DataFrame plus the distinct record types.

    The three date columns are parsed to datetimes with the timezone
    stripped — presumably so they can be written to Excel; confirm against
    openpyxl's handling of tz-aware datetimes.
    """
    root = ET.parse(globals.data_file_path).getroot()
    records_df = pd.DataFrame([element.attrib for element in root.iter('Record')])

    for column_name in ('creationDate', 'startDate', 'endDate'):
        records_df[column_name] = pd.to_datetime(records_df[column_name]).dt.tz_localize(None)

    record_types: list[str] = records_df['type'].unique().tolist()

    return records_df, record_types


def to_snake_case(string: str) -> str:
    """Convert a CamelCase/mixedCase identifier to snake_case."""
    parts = []
    for char in string:
        if char.isupper():
            parts.append('_')
            parts.append(char.lower())
        else:
            parts.append(char)
    # A leading uppercase letter produces a spurious leading underscore.
    return ''.join(parts).removeprefix('_')


def type_identifier_to_name(type_identifier: str) -> str:
    """Strip the HealthKit type-identifier prefix and convert the rest to snake_case."""
    stripped = (type_identifier
                .removeprefix('HKQuantityTypeIdentifier')
                .removeprefix('HKCategoryTypeIdentifier')
                .removeprefix('HKDataType'))
    return to_snake_case(stripped)


def print_all_record_types(record_types: list[str]):
    """Print a numbered list mapping each raw type identifier to its snake_case name."""
    for index, record_type in enumerate(record_types, start=1):
        print(f'{index}:', record_type, '->', type_identifier_to_name(record_type))


def write_all_records_excel_file(records_df: pd.DataFrame, rearranged=False):
    """Write the full records DataFrame to ./out/xlsx/all_records[_rearranged].xlsx."""
    filename = 'all_records_rearranged' if rearranged else 'all_records'
    file_path = globals.get_output_file_path(filename, 'xlsx')
    print(f'Write all records Excel file to: "{file_path}"')
    records_df.to_excel(file_path)


def write_blood_pressure_excel_file(rearranged_records_df: pd.DataFrame, reduce_output: bool):
    """Join systolic and diastolic readings and write them to one Excel file.

    Assumes that when reduce_output is true the input frame already lacks the
    device/sourceName/sourceVersion columns (main() builds it that way), so
    only the columns actually present are dropped in that case.

    Fixes over the original: the spurious ``f`` prefix on the diastolic query
    string (it contained no placeholders) is removed, and the
    ``inplace=True`` drops on query() results are replaced by explicit
    reassignment, which does not depend on the copy-on-write option.
    """
    blood_pressure_systolic_df = rearranged_records_df.query('type == "HKQuantityTypeIdentifierBloodPressureSystolic"')
    if reduce_output:
        blood_pressure_systolic_df = blood_pressure_systolic_df.drop(columns=['type', 'unit'])
    else:
        blood_pressure_systolic_df = blood_pressure_systolic_df.drop(columns=['type', 'unit', 'device', 'sourceName', 'sourceVersion'])

    blood_pressure_diastolic_df = rearranged_records_df.query('type == "HKQuantityTypeIdentifierBloodPressureDiastolic"')
    blood_pressure_diastolic_df = blood_pressure_diastolic_df.drop(columns=['type', 'startDate', 'endDate'])

    # Records sharing a creationDate belong to one measurement, so an inner
    # merge pairs each systolic value (value_x) with its diastolic value (value_y).
    merged_blood_pressure_df = pd.merge(blood_pressure_systolic_df, blood_pressure_diastolic_df, on='creationDate')
    merged_blood_pressure_df = merged_blood_pressure_df.rename(columns={'value_x': 'valueSystolic', 'value_y': 'valueDiastolic'})

    file_path = globals.get_output_file_path('blood_pressure', 'xlsx')
    print(f'Write blood pressure Excel file to: "{file_path}"')
    merged_blood_pressure_df.to_excel(file_path)


def write_all_other_excel_files(record_types: list[str], rearranged_records_df: pd.DataFrame):
    """Write one Excel file per record type, skipping the blood-pressure types.

    Blood pressure is handled separately by write_blood_pressure_excel_file.

    Fixes over the original: the blood-pressure types are filtered out
    instead of being removed from the caller's list in place (a hidden side
    effect), the nested same-quote f-string (valid only on Python 3.12+) is
    replaced by a precomputed variable, and the ``inplace=True`` drops on
    query() results are replaced by explicit reassignment.
    """
    blood_pressure_types = {
        'HKQuantityTypeIdentifierBloodPressureSystolic',
        'HKQuantityTypeIdentifierBloodPressureDiastolic',
    }

    for record_type in (t for t in record_types if t not in blood_pressure_types):
        filtered_records_df = rearranged_records_df.query(f'type == "{record_type}"')
        filtered_records_df = filtered_records_df.drop(columns='type')

        # Category-type records get the unit column dropped — presumably they
        # carry no meaningful unit; confirm against the export data.
        if record_type.startswith('HKCategoryTypeIdentifier'):
            filtered_records_df = filtered_records_df.drop(columns='unit')

        record_name = type_identifier_to_name(record_type)
        display_name = record_name.replace('_', ' ')
        file_path = globals.get_output_file_path(record_name, 'xlsx')
        print(f'Write {display_name} Excel file to: "{file_path}"')
        filtered_records_df.to_excel(file_path)


def main() -> int:
    """CLI entry point: parse the health export and write the requested Excel files."""
    parser = argparse.ArgumentParser(
        description=globals.get_argparse_description('Excel files'),
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-p', '--print-types', help='whether all record types should be printed', action='store_true')
    parser.add_argument('-o', '--one-file', help='whether all records should be written to one Excel file', action='store_true')
    parser.add_argument('-s', '--separate-files', help='whether all records should be written to separate Excel files', action='store_true')
    parser.add_argument('-r', '--reduce-output', help='whether to reduce output when writing Excel files', action='store_true')

    args = parser.parse_args()

    # No flags at all: show the help text instead of silently doing nothing.
    if not any(vars(args).values()):
        parser.print_help()
        return 0

    # --reduce-output on its own writes no files, so tell the user and stop early.
    if args.reduce_output and sum(vars(args).values()) == 1:
        print('info: the option to reduce output has no effect if no Excel files are being written')
        return 0

    records_df, record_types = parse_health_export()

    # Printing the types is an inspection mode; it short-circuits any writing.
    if args.print_types:
        print('All record types:\n')
        print_all_record_types(record_types)
        return 0

    # Reorder the columns; without --reduce-output the device/source
    # metadata columns are kept as well.
    column_order = ['type', 'creationDate', 'startDate', 'endDate', 'value', 'unit']
    if not args.reduce_output:
        column_order += ['device', 'sourceName', 'sourceVersion']
    rearranged_records_df = records_df[column_order]

    if args.one_file:
        print('All records Excel files are being written:\n')
        write_all_records_excel_file(records_df)
        write_all_records_excel_file(rearranged_records_df, True)
        if args.separate_files:
            print('\n')

    if args.separate_files:
        print('All other Excel files are being written:\n')
        write_blood_pressure_excel_file(rearranged_records_df, args.reduce_output)
        write_all_other_excel_files(record_types, rearranged_records_df)

    return 0


# Run the CLI when executed as a script; propagate main()'s exit status.
if __name__ == '__main__':
    sys.exit(main())
5 changes: 5 additions & 0 deletions pip_install.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/zsh

# Create a local virtual environment and install the project's Python
# dependencies into it.
# Abort on the first failing command so that a failed venv creation does
# not lead to packages being installed into the global interpreter.
set -e

python3 -m venv venv
source ./venv/bin/activate
python3 -m pip install -r requirements.txt
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
pandas
openpyxl

0 comments on commit 6d02be9

Please sign in to comment.