Skip to content

Instantly share code, notes, and snippets.

@svenski
Created April 6, 2019 10:30
Show Gist options
  • Save svenski/a433a823511a0f9a0941deba93fa0d2f to your computer and use it in GitHub Desktop.
Save svenski/a433a823511a0f9a0941deba93fa0d2f to your computer and use it in GitHub Desktop.

Revisions

  1. svenski created this gist Apr 6, 2019.
    53 changes: 53 additions & 0 deletions text2graph.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,53 @@
    import spacy
    import nltk
    from nltk.corpus import gutenberg
    # Module-level spaCy pipeline, loaded once at import time and shared by
    # main() for both sentence splitting and named-entity recognition.
    # The commented-out call below is an alternative load using the 'en' name.
    #nlp = spacy.load('en')
    nlp = spacy.load('en_core_web_sm')
    import random
    import pandas as pd
    import matplotlib.pyplot as plt

    import re

    def main():
        """Draw a co-occurrence graph of the people named in Austen's *Emma*.

        Pipeline:
          1. Read ``austen-emma.txt`` from the NLTK Gutenberg corpus and split it
             into sentences with the module-level spaCy pipeline ``nlp``.
          2. Run named-entity recognition on every whitespace-normalised
             sentence.
          3. Keep the sentences that mention exactly two distinct ``PERSON``
             entities and count how often each pair occurs (sentences with
             identical text are counted once).
          4. Add the ten most frequent pairs as weighted edges of an undirected
             graph and draw it with ``networkx.draw``.

        Returns:
            The ``networkx.Graph`` that was drawn; each edge carries the pair's
            sentence count in its ``weight`` attribute.
        """
        # Imported here because the file's top-level imports do not include it.
        import networkx as nx

        # Turn the double-hyphen dashes into spaces before parsing
        # (presumably so the words on either side are tokenised separately).
        text = gutenberg.raw('austen-emma.txt').replace('--', ' ')

        sentences = _clean_sentences(text)
        ents = _extract_entities(sentences)
        top_pairs = _top_person_pairs(ents, limit=10)

        G = nx.Graph()
        for (first, second), count in top_pairs:
            G.add_edge(first, second, weight=count)

        nx.draw(G)
        return G


    def _clean_sentences(text):
        """Split *text* into sentences and collapse whitespace runs in each."""
        # Raw string: "\s" in a normal literal is an invalid escape sequence.
        whitespace = re.compile(r"\s+")
        return [whitespace.sub(" ", sent.text) for sent in nlp(text).sents]


    def _extract_entities(sentences):
        """Return a DataFrame with one row per named entity found in *sentences*.

        Columns are ``Sentence`` (the sentence text), ``Entity`` (the entity
        text) and ``Entity_type`` (the spaCy entity label).
        """
        rows = []
        # nlp.pipe processes the sentences as a batched stream and yields the
        # parsed documents in input order, so zip keeps text and parse aligned.
        for sentence, doc in zip(sentences, nlp.pipe(sentences)):
            for ent in doc.ents:
                rows.append((sentence, ent.text, ent.label_))
        return pd.DataFrame(rows, columns=['Sentence', 'Entity', 'Entity_type'])


    def _top_person_pairs(ents, limit=10):
        """Return the *limit* most frequent PERSON pairs as ``((a, b), count)``.

        A pair is counted once for every distinct sentence text that mentions
        exactly two distinct PERSON entities.
        """
        people = ents[ents['Entity_type'] == 'PERSON']

        # Distinct person names per sentence text.
        names_by_sentence = {}
        for sentence, name in zip(people['Sentence'], people['Entity']):
            names_by_sentence.setdefault(sentence, set()).add(name)

        pair_counts = {}
        for names in names_by_sentence.values():
            if len(names) != 2:
                continue
            # Sort the two names so (A, B) and (B, A) share one key; set
            # iteration order is not a reliable canonical order.
            pair = tuple(sorted(names))
            pair_counts[pair] = pair_counts.get(pair, 0) + 1

        # Highest count first; ties broken alphabetically for reproducibility.
        ranked = sorted(pair_counts.items(), key=lambda item: (-item[1], item[0]))
        return ranked[:limit]