Skip to content

Commit

Permalink
NLP Profiler: updated the source and notebooks with changes to explai…
Browse files Browse the repository at this point in the history
…n the steps better
  • Loading branch information
neomatrix369 committed Jul 3, 2020
1 parent 8a45c01 commit 489342b
Show file tree
Hide file tree
Showing 2 changed files with 103 additions and 64 deletions.
18 changes: 18 additions & 0 deletions examples/better-nlp/library/org/neomatrix369/nlp_profiler.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
#!/usr/bin/env python3

# Copyright 2020 Mani Sarkar

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from itertools import groupby

import re
Expand Down Expand Up @@ -56,6 +72,8 @@ def sentiment_polarity(score):
def sentiment_polarity_score(text):
    """Return the polarity component of TextBlob's sentiment for ``text``.

    The text is wrapped in a ``TextBlob`` and the ``polarity`` attribute of
    its ``sentiment`` result is handed back unchanged.
    """
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

### See https://en.wikipedia.org/wiki/Words_of_estimative_probability

subjectivity_words_of_probability_estimation = [
["Very subjective", 99, 100], # Certain: 100%: Give or take 0%
### The General Area of Possibility
Expand Down
149 changes: 85 additions & 64 deletions examples/better-nlp/notebooks/jupyter/nlp_profiler.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -273,9 +273,90 @@
"text_dataframe"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Pandas describe() function"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>2833047 people live in this area. It is not a ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text\n",
"count 7\n",
"unique 7\n",
"top 2833047 people live in this area. It is not a ...\n",
"freq 1"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text_dataframe.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## NLP profiler's equivalent to that"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 11,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -437,52 +518,6 @@
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Todays date is 28/04/2020 and tomorrow's date ...</td>\n",
" <td>0.000000</td>\n",
" <td>Neutral</td>\n",
" <td>0.00</td>\n",
" <td>Very objective</td>\n",
" <td>0.748268</td>\n",
" <td>Pretty good</td>\n",
" <td>2</td>\n",
" <td>60</td>\n",
" <td>8</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>52</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>46</td>\n",
" <td>14</td>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Everyone here is so hardworking. Hardworking p...</td>\n",
" <td>0.700000</td>\n",
" <td>Positive</td>\n",
" <td>0.60</td>\n",
" <td>Objective/subjective</td>\n",
" <td>0.857143</td>\n",
" <td>Pretty good</td>\n",
" <td>4</td>\n",
" <td>112</td>\n",
" <td>17</td>\n",
" <td>18</td>\n",
" <td>3</td>\n",
" <td>95</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>92</td>\n",
" <td>20</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
Expand All @@ -494,72 +529,58 @@
"2 2833047 and 1111 people live in this area. \n",
"3 This sentence doesn't seem to too many commas,... \n",
"4 Todays date is 04/28/2020 for format mm/dd/yyy... \n",
"5 Todays date is 28/04/2020 and tomorrow's date ... \n",
"6 Everyone here is so hardworking. Hardworking p... \n",
"\n",
" sentiment_polarity_score sentiment_polarity sentiment_subjectivity_score \\\n",
"0 0.380000 Positive 0.43 \n",
"1 -0.106818 Negative 0.55 \n",
"2 0.136364 Positive 0.50 \n",
"3 0.375000 Positive 0.75 \n",
"4 0.000000 Neutral 0.00 \n",
"5 0.000000 Neutral 0.00 \n",
"6 0.700000 Positive 0.60 \n",
"\n",
" sentiment_subjectivity spellcheck_score spelling_quality sentences_count \\\n",
"0 Objective/subjective 1.000000 Good 2 \n",
"1 Objective/subjective 0.968802 Quite good 3 \n",
"2 Objective/subjective 1.000000 Good 2 \n",
"3 Pretty subjective 0.923887 Quite good 2 \n",
"4 Very objective 0.711513 Pretty good 2 \n",
"5 Very objective 0.748268 Pretty good 2 \n",
"6 Objective/subjective 0.857143 Pretty good 4 \n",
"\n",
" characters_count spaces_count words_count duplicates_count \\\n",
"0 21 5 4 0 \n",
"1 56 11 11 2 \n",
"2 42 7 6 0 \n",
"3 74 11 13 0 \n",
"4 64 8 9 0 \n",
"5 60 8 8 2 \n",
"6 112 17 18 3 \n",
"\n",
" chars_excl_spaces_count emoji_count whole_numbers_count \\\n",
"0 16 2 0 \n",
"1 45 0 1 \n",
"2 35 0 2 \n",
"3 63 0 0 \n",
"4 56 0 6 \n",
"5 52 0 6 \n",
"6 95 0 0 \n",
"\n",
" alpha_numeric_count non_alpha_numeric_count punctuations_count \\\n",
"0 13 8 1 \n",
"1 43 13 2 \n",
"2 34 8 1 \n",
"3 56 18 7 \n",
"4 48 16 8 \n",
"5 46 14 6 \n",
"6 92 20 3 \n",
"\n",
" stop_words_count dates_count \n",
"0 1 0 \n",
"1 5 0 \n",
"2 3 0 \n",
"3 4 0 \n",
"4 3 1 \n",
"5 3 2 \n",
"6 7 0 "
"4 3 1 "
]
},
"execution_count": 5,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"profiled_text_dataframe = apply_text_profiling(text_dataframe, 'text')\n",
"profiled_text_dataframe"
"profiled_text_dataframe.head()"
]
},
{
Expand Down

0 comments on commit 489342b

Please sign in to comment.