import unicodecsv as csv
reader = csv.reader(open('../from_box/Master_Files_Fall_2018/CHUNKED.aligned_hand_deu_eng_2018_07_16.csv'),
encoding='utf-8')
header = None
source_data = []
for rn, row in enumerate(reader):
if rn == 0:
header = row
else:
source_data.append(row)
print 'Loaded!'
import spacy
print spacy.__version__
en_nlp = spacy.load('en')
de_nlp = spacy.load('de')
import copy
from collections import defaultdict
all_ratios = []
for rn, row in enumerate(source_data):
collected_nodes = defaultdict(int)
collected_edges = defaultdict(int)
n_german_tokens = 0
for cn, cell in enumerate(row[2:]):
if cn == 0:
doc = de_nlp(unicode(cell))
for token in doc:
if token.is_stop == False and token.pos_ not in ['PUNCT', 'SPACE']:
n_german_tokens += 1
continue
doc = en_nlp(unicode(cell))
all_tokens = ['START',]
for token in doc:
if token.is_stop == False and token.pos_ not in ['PUNCT', 'SPACE'] \
and token.text.lower() not in ['n\'t', '\'ll', 'and', '\'s']:
all_tokens.append(token.text.lower())
all_tokens.append('END')
for a in range(0, len(all_tokens)):
collected_nodes[all_tokens[a]] += 1
if a + 1 < len(all_tokens):
collected_edges[(all_tokens[a], all_tokens[a + 1])] += 1
all_ratios.append([(len(collected_nodes) / float(n_german_tokens)), rn])
sorted_all_ratios = copy.deepcopy(all_ratios)
sorted_all_ratios.sort()
print '5 lowest ratios', sorted_all_ratios[:5]
sorted_all_ratios.sort(reverse=True)
print '5 highest ratios', sorted_all_ratios[:5]
from collections import defaultdict
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
def graph_chunks(scored_rn_to_graph):
from pylab import rcParams
rcParams['figure.figsize'] = 30, 30
sns.set_style("whitegrid")
for rn, row in enumerate(source_data):
if rn in scored_rn_to_graph:
pass
else:
continue
print
print (rn + 2), '-----------------------------------------------------------------------------'
print
print row[1]
collected_nodes = defaultdict(int)
collected_edges = defaultdict(int)
n_german_tokens = 0
for cn, cell in enumerate(row[2:]):
if cn == 0:
doc = de_nlp(unicode(cell))
for token in doc:
if token.is_stop == False and token.pos_ not in ['PUNCT', 'SPACE']:
n_german_tokens += 1
continue
doc = en_nlp(unicode(cell))
all_tokens = ['START',]
for token in doc:
if token.is_stop == False and token.pos_ not in ['PUNCT', 'SPACE'] \
and token.text.lower() not in ['n\'t', '\'ll', 'and', '\'s']:
all_tokens.append(token.text.lower())
all_tokens.append('END')
for a in range(0, len(all_tokens)):
collected_nodes[all_tokens[a]] += 1
if a + 1 < len(all_tokens):
collected_edges[(all_tokens[a], all_tokens[a + 1])] += 1
print
print 'n_german_tokens', n_german_tokens, 'n collected_nodes', len(collected_nodes)
print 'RATIO', (len(collected_nodes) / float(n_german_tokens))
G = nx.Graph()
for k, v in collected_edges.iteritems():
G.add_edge(k[0], k[1], weight=v)
pos = nx.drawing.layout.spring_layout(G, iterations=100, weight="weight")
fig, ax = plt.subplots()
for k, v in collected_edges.iteritems():
from_node = pos[k[0]]
to_node = pos[k[1]]
ax.plot([from_node[0], to_node[0]],
[from_node[1], to_node[1]],
color='#999999', linestyle='-', linewidth=(v * 2), zorder=1)
for k, v in collected_nodes.iteritems():
selected_color = '#000000'
if k == 'START' or k == 'END':
selected_color = '#FF0000'
ax.text(pos[k][0], pos[k][1], k, fontsize=(v * 2), color=selected_color)
ax.set_xlim(-1.1, 1.1)
ax.set_ylim(-1.1, 1.1)
plt.axis('off')
plt.show()
print '----------------------------------------------------'
print '5 LOWEST RATIO CHUNKS'
print '----------------------------------------------------'
sorted_all_ratios.sort()
ratio_indexes = []
for r in sorted_all_ratios[:5]:
ratio_indexes.append(r[1])
graph_chunks(ratio_indexes)
print '----------------------------------------------------'
print '5 HIGHEST RATIO CHUNKS'
print '----------------------------------------------------'
sorted_all_ratios.sort(reverse=True)
ratio_indexes = []
for r in sorted_all_ratios[:5]:
ratio_indexes.append(r[1])
graph_chunks(ratio_indexes)