Skip to content

Commit

Permalink
make csv file
Browse files Browse the repository at this point in the history
  • Loading branch information
rezwanh001 committed Jul 18, 2022
1 parent fc10252 commit dbc90b7
Showing 1 changed file with 181 additions and 0 deletions.
181 changes: 181 additions & 0 deletions make_csv_data.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 7275/7275 [06:37<00:00, 18.30it/s]\n",
"100%|██████████| 7500/7500 [07:13<00:00, 17.32it/s] \n"
]
}
],
"source": [
"# Import the modules\n",
"import re \n",
"import json\n",
"import glob\n",
"import csv \n",
"import os \n",
"from tqdm import tqdm\n",
"\n",
"src_dir = \"/media/rezwan/Study/Bengali_AI/hadith/hadith/\"\n",
"dest_csv_dir = \"/media/rezwan/Study/Bengali_AI/hadith/hadith/bukhari_muslim.csv\"\n",
"hadith_books = ['bukhari', 'muslim']\n",
"\n",
"## create list\n",
"_id = [] \n",
"_src_book= []\n",
"_chp_no = []\n",
"_hadith_no = []\n",
"_narrator = []\n",
"_validity = []\n",
"_ref = []\n",
"_text_bn = []\n",
"_text_ar = []\n",
"_explanation = []\n",
"_extra_note = []\n",
"_preface = []\n",
"\n",
"id_num = 0\n",
"\n",
"for hadith in hadith_books:\n",
" read_json_dir = os.path.join(src_dir, hadith)\n",
" \n",
" # which hadith book\n",
" if hadith == 'bukhari':\n",
" hadith_book_name = 'সহিহ বুখারী'\n",
" elif hadith == 'muslim':\n",
" hadith_book_name = 'সহিহ মুসলিম'\n",
"\n",
" for file in tqdm(sorted(glob.glob(os.path.join(read_json_dir, \"*\")))):\n",
" # Opening JSON file\n",
" f = open(file)\n",
" # Convert JSON string to dictionary\n",
" data = json.load(f)\n",
"\n",
" text_ar = data['data']['arabic']\n",
" text_bn = data['data']['bangla']\n",
" explanation = data['data']['explanation']\n",
" extra_note = data['data']['extra-note']\n",
" narrator = data['data']['narrator']\n",
" validity = data['data']['validity']\n",
"\n",
" chapter_no = data['meta']['header']\n",
" preface = data['meta']['preface']\n",
"\n",
" if preface != None:\n",
" preface = \" \".join(data['meta']['preface'])\n",
"\n",
" translator = None\n",
" ref = None \n",
" if text_bn != None:\n",
" # \"(?<!\\.)\\n\\n\"\n",
" newline_pattern=u'(?<!\\.)\\n\\n' #\\n\\n\n",
" translator_ref=re.split(newline_pattern, text_bn)\n",
" if len(translator_ref) > 1:\n",
" ref = translator_ref[-1]\n",
" translator = \"\".join(translator_ref[0:-1])\n",
"\n",
" _id.append(id_num)\n",
" id_num += 1\n",
" _src_book.append(hadith_book_name)\n",
" _chp_no.append(chapter_no)\n",
"\n",
" jsno_file = file.split(\"/\")[-1]\n",
" hadith_no = jsno_file.split(\".\")[0]\n",
" _hadith_no.append(int(hadith_no))\n",
"\n",
" _narrator.append(narrator)\n",
" _validity.append(validity)\n",
" _ref.append(ref)\n",
" _text_bn.append(translator)\n",
" _text_ar.append(text_ar)\n",
" _explanation.append(explanation)\n",
" _extra_note.append(extra_note)\n",
" _preface.append(preface)\n",
" \n",
"## Create dictionary\n",
"df_dict = {}\n",
"\n",
"df_dict['id'] = _id\n",
"df_dict['source_book'] = _src_book\n",
"df_dict['chapter_no'] = _chp_no\n",
"df_dict['hadith_no'] = _hadith_no\n",
"df_dict['narrator'] = _narrator\n",
"df_dict['validity'] = _validity\n",
"\n",
"df_dict['chain_idx'] = _ref\n",
"df_dict['text_bn'] = _text_bn\n",
"df_dict['text_ar'] = _text_ar\n",
"df_dict['explanation'] = _explanation\n",
"df_dict['extra_note'] = _extra_note\n",
"df_dict['preface'] = _preface\n",
"\n",
"# create a csv file \n",
"with open(dest_csv_dir, \"w\") as out_file:\n",
" csv_file = csv.writer(out_file)\n",
" csv_file.writerow(df_dict.keys())\n",
" csv_file.writerows(zip(*df_dict.values()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### json data format\n",
"\n",
"'''\n",
"{\n",
" \"data\": {\n",
" \"arabic\": \"\",\n",
" \"bangla\": \"\",\n",
" \"explanation\": \"\",\n",
" \"extra-note\": null,\n",
" \"narrator\": \"\",\n",
" \"validity\": \"\"\n",
" },\n",
" \"meta\": {\n",
" \"header\": \"\",\n",
" \"preface\": [\n",
" \"\",\n",
" \"\"\n",
" ]\n",
" }\n",
"}\n",
"'''"
]
}
],
"metadata": {
"interpreter": {
"hash": "d8bbe4fe62dd3d3fc281495773bee29eba95fa92a2e9ec6daef308576e4d9e58"
},
"kernelspec": {
"display_name": "Python 3.8.5 ('bengali_ai')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit dbc90b7

Please sign in to comment.