make csv file

mnansary · Jul 18, 2022 · dbc90b7 · dbc90b7
1 parent fc10252
commit dbc90b7
Showing 1 changed file with 181 additions and 0 deletions.
diff --git a/make_csv_data.ipynb b/make_csv_data.ipynb
@@ -0,0 +1,181 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 7275/7275 [06:37<00:00, 18.30it/s]\n",
+      "100%|██████████| 7500/7500 [07:13<00:00, 17.32it/s] \n"
+     ]
+    }
+   ],
+   "source": [
+    "# Import the modules\n",
+    "import re \n",
+    "import json\n",
+    "import glob\n",
+    "import csv \n",
+    "import os \n",
+    "from tqdm import tqdm\n",
+    "\n",
+    "# --- Configuration ---\n",
+    "# NOTE(review): absolute, machine-specific paths -- adjust before running elsewhere.\n",
+    "src_dir = \"/media/rezwan/Study/Bengali_AI/hadith/hadith/\"\n",
+    "dest_csv_dir = \"/media/rezwan/Study/Bengali_AI/hadith/hadith/bukhari_muslim.csv\"\n",
+    "hadith_books = ['bukhari', 'muslim']\n",
+    "\n",
+    "# Title written to the source_book column for each book sub-directory.\n",
+    "# A book missing from this mapping raises KeyError instead of silently\n",
+    "# reusing the title of the previously processed book.\n",
+    "book_titles = {\n",
+    "    'bukhari': 'সহিহ বুখারী',\n",
+    "    'muslim': 'সহিহ মুসলিম',\n",
+    "}\n",
+    "\n",
+    "# Matches a blank line (two newlines) that is not directly preceded by a full stop.\n",
+    "# Written as a raw string so the escaped dot is handed to the regex engine as-is\n",
+    "# (the non-raw spelling is an invalid string escape and emits a DeprecationWarning).\n",
+    "# Defined once here because it does not change between files.\n",
+    "newline_pattern = r'(?<!\\.)\\n\\n'\n",
+    "\n",
+    "## create list: one list per CSV column, kept in lock-step (one entry per JSON file)\n",
+    "_id = []\n",
+    "_src_book = []\n",
+    "_chp_no = []\n",
+    "_hadith_no = []\n",
+    "_narrator = []\n",
+    "_validity = []\n",
+    "_ref = []\n",
+    "_text_bn = []\n",
+    "_text_ar = []\n",
+    "_explanation = []\n",
+    "_extra_note = []\n",
+    "_preface = []\n",
+    "\n",
+    "# Running row id, continuous across all books.\n",
+    "id_num = 0\n",
+    "\n",
+    "for hadith in hadith_books:\n",
+    "    read_json_dir = os.path.join(src_dir, hadith)\n",
+    "\n",
+    "    # which hadith book\n",
+    "    hadith_book_name = book_titles[hadith]\n",
+    "\n",
+    "    for file in tqdm(sorted(glob.glob(os.path.join(read_json_dir, \"*\")))):\n",
+    "        # The context manager closes every handle (the loop opens thousands of files);\n",
+    "        # UTF-8 is stated explicitly instead of depending on the platform default.\n",
+    "        with open(file, encoding=\"utf-8\") as f:\n",
+    "            data = json.load(f)\n",
+    "\n",
+    "        text_ar = data['data']['arabic']\n",
+    "        text_bn = data['data']['bangla']\n",
+    "        explanation = data['data']['explanation']\n",
+    "        extra_note = data['data']['extra-note']\n",
+    "        narrator = data['data']['narrator']\n",
+    "        validity = data['data']['validity']\n",
+    "\n",
+    "        chapter_no = data['meta']['header']\n",
+    "        preface = data['meta']['preface']\n",
+    "\n",
+    "        # preface is either null or a list of strings; flatten the list to one string.\n",
+    "        if preface is not None:\n",
+    "            preface = \" \".join(preface)\n",
+    "\n",
+    "        # Split the Bangla field on the separator: the last piece becomes the\n",
+    "        # reference (chain_idx column), everything before it is concatenated\n",
+    "        # into the text (text_bn column).\n",
+    "        translator = None\n",
+    "        ref = None\n",
+    "        if text_bn is not None:\n",
+    "            translator_ref = re.split(newline_pattern, text_bn)\n",
+    "            if len(translator_ref) > 1:\n",
+    "                ref = translator_ref[-1]\n",
+    "                translator = \"\".join(translator_ref[:-1])\n",
+    "            # NOTE(review): when no separator is found both values stay None, so the\n",
+    "            # text_bn column is empty for that row even though the JSON had text --\n",
+    "            # confirm this is intended.\n",
+    "\n",
+    "        _id.append(id_num)\n",
+    "        id_num += 1\n",
+    "        _src_book.append(hadith_book_name)\n",
+    "        _chp_no.append(chapter_no)\n",
+    "\n",
+    "        # The hadith number is the file name up to its first dot.\n",
+    "        json_file = os.path.basename(file)\n",
+    "        hadith_no = json_file.split(\".\")[0]\n",
+    "        _hadith_no.append(int(hadith_no))\n",
+    "\n",
+    "        _narrator.append(narrator)\n",
+    "        _validity.append(validity)\n",
+    "        _ref.append(ref)\n",
+    "        _text_bn.append(translator)\n",
+    "        _text_ar.append(text_ar)\n",
+    "        _explanation.append(explanation)\n",
+    "        _extra_note.append(extra_note)\n",
+    "        _preface.append(preface)\n",
+    "\n",
+    "## Create dictionary: column name -> column values (insertion order = CSV column order)\n",
+    "df_dict = {\n",
+    "    'id': _id,\n",
+    "    'source_book': _src_book,\n",
+    "    'chapter_no': _chp_no,\n",
+    "    'hadith_no': _hadith_no,\n",
+    "    'narrator': _narrator,\n",
+    "    'validity': _validity,\n",
+    "    'chain_idx': _ref,\n",
+    "    'text_bn': _text_bn,\n",
+    "    'text_ar': _text_ar,\n",
+    "    'explanation': _explanation,\n",
+    "    'extra_note': _extra_note,\n",
+    "    'preface': _preface,\n",
+    "}\n",
+    "\n",
+    "# create a csv file\n",
+    "# newline=\"\" lets the csv module control row terminators itself (avoids doubled\n",
+    "# carriage returns on platforms that translate newlines); UTF-8 is explicit\n",
+    "# because the rows contain Bengali and Arabic text.\n",
+    "with open(dest_csv_dir, \"w\", newline=\"\", encoding=\"utf-8\") as out_file:\n",
+    "    csv_file = csv.writer(out_file)\n",
+    "    csv_file.writerow(df_dict.keys())\n",
+    "    # zip(*columns) transposes the column lists into one tuple per row.\n",
+    "    csv_file.writerows(zip(*df_dict.values()))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "### Expected layout of each per-hadith JSON input file parsed by the conversion cell above\n",
+    "\n",
+    "'''\n",
+    "{\n",
+    "    \"data\": {\n",
+    "        \"arabic\": \"\",\n",
+    "        \"bangla\": \"\",\n",
+    "        \"explanation\": \"\",\n",
+    "        \"extra-note\": null,\n",
+    "        \"narrator\": \"\",\n",
+    "        \"validity\": \"\"\n",
+    "    },\n",
+    "    \"meta\": {\n",
+    "        \"header\": \"\",\n",
+    "        \"preface\": [\n",
+    "            \"\",\n",
+    "            \"\"\n",
+    "        ]\n",
+    "    }\n",
+    "}\n",
+    "'''"
+   ]
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "d8bbe4fe62dd3d3fc281495773bee29eba95fa92a2e9ec6daef308576e4d9e58"
+  },
+  "kernelspec": {
+   "display_name": "Python 3.8.5 ('bengali_ai')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}