NLP Profiler: updated the source and notebooks with changes to explain the steps better
yml-blog · Jul 3, 2020 · 489342b · 489342b
1 parent 8a45c01
commit 489342b
Show file tree

Hide file tree

Showing 2 changed files with 103 additions and 64 deletions.
diff --git a/examples/better-nlp/library/org/neomatrix369/nlp_profiler.py b/examples/better-nlp/library/org/neomatrix369/nlp_profiler.py
@@ -1,3 +1,19 @@
+#!/bin/bash
+
+# Copyright 2020 Mani Sarkar
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from itertools import groupby
 
 import re
@@ -56,6 +72,8 @@ def sentiment_polarity(score):
 def sentiment_polarity_score(text):
 	return TextBlob(text).sentiment.polarity
 
+ ### See https://en.wikipedia.org/wiki/Words_of_estimative_probability
+
 subjectivity_words_of_probability_estimation = [
     ["Very subjective", 99, 100],  # Certain: 100%: Give or take 0%
     ### The General Area of Possibility

diff --git a/examples/better-nlp/notebooks/jupyter/nlp_profiler.ipynb b/examples/better-nlp/notebooks/jupyter/nlp_profiler.ipynb
@@ -273,9 +273,90 @@
     "text_dataframe"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Pandas describe() function"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>text</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>count</th>\n",
+       "      <td>7</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>unique</th>\n",
+       "      <td>7</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>top</th>\n",
+       "      <td>2833047 people live in this area. It is not a ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>freq</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                     text\n",
+       "count                                                   7\n",
+       "unique                                                  7\n",
+       "top     2833047 people live in this area. It is not a ...\n",
+       "freq                                                    1"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "text_dataframe.describe()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## NLP profiler's equivalent to that"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -437,52 +518,6 @@
        "      <td>3</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5</th>\n",
-       "      <td>Todays date is 28/04/2020 and tomorrow's date ...</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>Neutral</td>\n",
-       "      <td>0.00</td>\n",
-       "      <td>Very objective</td>\n",
-       "      <td>0.748268</td>\n",
-       "      <td>Pretty good</td>\n",
-       "      <td>2</td>\n",
-       "      <td>60</td>\n",
-       "      <td>8</td>\n",
-       "      <td>8</td>\n",
-       "      <td>2</td>\n",
-       "      <td>52</td>\n",
-       "      <td>0</td>\n",
-       "      <td>6</td>\n",
-       "      <td>46</td>\n",
-       "      <td>14</td>\n",
-       "      <td>6</td>\n",
-       "      <td>3</td>\n",
-       "      <td>2</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>6</th>\n",
-       "      <td>Everyone here is so hardworking. Hardworking p...</td>\n",
-       "      <td>0.700000</td>\n",
-       "      <td>Positive</td>\n",
-       "      <td>0.60</td>\n",
-       "      <td>Objective/subjective</td>\n",
-       "      <td>0.857143</td>\n",
-       "      <td>Pretty good</td>\n",
-       "      <td>4</td>\n",
-       "      <td>112</td>\n",
-       "      <td>17</td>\n",
-       "      <td>18</td>\n",
-       "      <td>3</td>\n",
-       "      <td>95</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>92</td>\n",
-       "      <td>20</td>\n",
-       "      <td>3</td>\n",
-       "      <td>7</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
@@ -494,72 +529,58 @@
        "2         2833047 and 1111 people live in this area.   \n",
        "3  This sentence doesn't seem to too many commas,...   \n",
        "4  Todays date is 04/28/2020 for format mm/dd/yyy...   \n",
-       "5  Todays date is 28/04/2020 and tomorrow's date ...   \n",
-       "6  Everyone here is so hardworking. Hardworking p...   \n",
        "\n",
        "   sentiment_polarity_score sentiment_polarity  sentiment_subjectivity_score  \\\n",
        "0                  0.380000           Positive                          0.43   \n",
        "1                 -0.106818           Negative                          0.55   \n",
        "2                  0.136364           Positive                          0.50   \n",
        "3                  0.375000           Positive                          0.75   \n",
        "4                  0.000000            Neutral                          0.00   \n",
-       "5                  0.000000            Neutral                          0.00   \n",
-       "6                  0.700000           Positive                          0.60   \n",
        "\n",
        "  sentiment_subjectivity  spellcheck_score spelling_quality  sentences_count  \\\n",
        "0   Objective/subjective          1.000000             Good                2   \n",
        "1   Objective/subjective          0.968802       Quite good                3   \n",
        "2   Objective/subjective          1.000000             Good                2   \n",
        "3      Pretty subjective          0.923887       Quite good                2   \n",
        "4         Very objective          0.711513      Pretty good                2   \n",
-       "5         Very objective          0.748268      Pretty good                2   \n",
-       "6   Objective/subjective          0.857143      Pretty good                4   \n",
        "\n",
        "   characters_count  spaces_count  words_count  duplicates_count  \\\n",
        "0                21             5            4                 0   \n",
        "1                56            11           11                 2   \n",
        "2                42             7            6                 0   \n",
        "3                74            11           13                 0   \n",
        "4                64             8            9                 0   \n",
-       "5                60             8            8                 2   \n",
-       "6               112            17           18                 3   \n",
        "\n",
        "   chars_excl_spaces_count  emoji_count  whole_numbers_count  \\\n",
        "0                       16            2                    0   \n",
        "1                       45            0                    1   \n",
        "2                       35            0                    2   \n",
        "3                       63            0                    0   \n",
        "4                       56            0                    6   \n",
-       "5                       52            0                    6   \n",
-       "6                       95            0                    0   \n",
        "\n",
        "   alpha_numeric_count  non_alpha_numeric_count  punctuations_count  \\\n",
        "0                   13                        8                   1   \n",
        "1                   43                       13                   2   \n",
        "2                   34                        8                   1   \n",
        "3                   56                       18                   7   \n",
        "4                   48                       16                   8   \n",
-       "5                   46                       14                   6   \n",
-       "6                   92                       20                   3   \n",
        "\n",
        "   stop_words_count  dates_count  \n",
        "0                 1            0  \n",
        "1                 5            0  \n",
        "2                 3            0  \n",
        "3                 4            0  \n",
-       "4                 3            1  \n",
-       "5                 3            2  \n",
-       "6                 7            0  "
+       "4                 3            1  "
       ]
      },
-     "execution_count": 5,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "profiled_text_dataframe = apply_text_profiling(text_dataframe, 'text')\n",
-    "profiled_text_dataframe"
+    "profiled_text_dataframe.head()"
    ]
   },
   {