from glob import glob
txt_file_names = []
for t in glob('../depart_vision_text/*.txt'):
if 'dedicated building' not in t:
txt_file_names.append(t)
print 'len(txt_file_names)', len(txt_file_names)
# NLP toolchain: spaCy for parsing (sentence segmentation, noun chunks),
# NLTK only for its English stopword list.
import spacy
nlp = spacy.load('en')  # 'en' shortcut model name (spaCy 1.x/2.x era)
import nltk
from nltk.corpus import stopwords
sw = set(stopwords.words('english'))  # set for O(1) membership tests below
import codecs, re, string
# Extra junk tokens to reject in addition to stopwords/punctuation.
# NOTE(review): the first '\u2019s' and '\u2013' entries are *byte* strings;
# in Python 2 a byte literal does not process \u escapes, so they are the
# literal characters "\u2019s" / "\u2013" and never match spaCy's unicode
# tokens — the unicode forms (u'’s', u'–', ...) are what actually match.
# Literals are kept byte-for-byte to preserve behavior.
_EXTRA_SKIP = frozenset(['\'s', '\u2019s', 'would', '\u2013',
                         u'’s', u'“', u'‘', u'—', u'”', u'–'])

def is_token_okay(t):
    """Return True if token `t` should be kept as a candidate feature.

    Rejects English stopwords, single punctuation characters, the extra
    junk tokens above, and anything that parses as an integer (years,
    counts) — none of which carry topical signal.
    """
    lowered = t.lower()
    if lowered in sw or t in string.punctuation or lowered in _EXTRA_SKIP:
        return False
    try:
        int(t)
    except ValueError:  # narrowed from a bare except: t is always a string here
        return True
    return False  # purely numeric token
# -----------------------------------------------------------------------
features_all_texts = []
for txt_file_name in txt_file_names:
text = codecs.open(txt_file_name, 'r', encoding='utf-8').read()
text = re.sub('\s+', ' ', text.replace(u'\ufeff', ''))
doc = nlp(unicode(text))
features = []
for s in doc.sents:
#okay_tokens = []
#all_tokens = []
#for token in s:
# all_tokens.append(token.text.lower())
# if is_token_okay(token.text.lower()):
# okay_tokens.append(token.text.lower())
#features += okay_tokens
for chunk in s.noun_chunks:
if chunk.text.lower() not in sw and len(chunk.text.lower().split(' ')) > 1:
features.append(chunk.text.lower())
#for NGRAM_LENGTH in [2, 3, 4]:
# for a in range(0, len(all_tokens) - NGRAM_LENGTH + 1):
# one_ngram = all_tokens[a: a + NGRAM_LENGTH]
# is_ngram_okay = True
# for token in one_ngram:
# if is_token_okay(token) == False:
# is_ngram_okay = False
# if is_ngram_okay:
# features.append(' '.join(one_ngram))
features_all_texts.append(features)
print
print 'len(features_all_texts)', len(features_all_texts)
from gensim import corpora, models, similarities
# Map each document's feature list to a bag-of-words over a shared
# vocabulary, then fit a TF-IDF model over the whole collection.
dictionary = corpora.Dictionary(features_all_texts)
corpus = [dictionary.doc2bow(text) for text in features_all_texts]
tfidf = models.TfidfModel(corpus)
# NOTE(review): corpus_tfidf is not referenced later in this chunk — the
# document-frequency report below reads `dictionary.dfs` directly.  It may
# be used further down the file; confirm before removing.
corpus_tfidf = tfidf[corpus]
from collections import defaultdict, Counter
document_frequencies = []
for k, v in dictionary.iteritems():
if dictionary.dfs[k] > 1:
if v.split(' ')[0] in ['a', 'the', 'our', 'an', 'both', 'this', 'these', 'their', 'other', 'its',
'such', 'that', 'top', 'faculty', 'many']:
pass
elif v.startswith('tenure-'):
pass
else:
document_frequencies.append([dictionary.dfs[k], v])
document_frequencies.sort(reverse=True)
print 'len(document_frequencies)', len(document_frequencies)
for df in document_frequencies:
print df
for df in document_frequencies:
print df[1]
# Maps each vision-statement filename (basename only, as produced by
# splitting the glob path on '/') to a human-readable department name.
# Keys must match the files on disk exactly, including trailing spaces.
files_to_departments = {'Academic Plan for the Department of African and African American Studies.txt': 'African and African-American Studies',
'AcademicVisionBiologyFINAL.txt': 'Biology',
'AcademicVisionEnglish.txt': 'English',
'Academic_Vision_EPS-12-08-17.txt': 'Earth and Planetary Sciences',
'Academic Vision Statement - Education.txt': 'Education',
'AHA Vision Statement Dec 2017.txt': 'Art History and Archeology',
'AMCS academic vision statement 12.1.17-2.txt': 'American Culture Studies',
'Anthropology vision fall 2017 v 3.txt': 'Anthropology',
'A Vision for Chemistry 12-12-17.txt': 'Chemistry',
'Center for the Humanities Vision Statement Jan 2018.txt': 'Center for the Humanities',
'Classics Vision letterhead 12.01.17.txt': 'Classics',
'Econ Dept Vision Final.txt': 'Economics',
'EnSt_Vision_Statement.txt': 'Environmental Studies',
'FMS ACADEMIC VISION STATEMENT 12 01 17.txt': 'Film and Media Studies',
'German_Final_Vision_Statement .txt': 'German',
'HDW Next Steps 2017.txt': 'Humanities Digital Workshop',
'History Department Academic Vision.txt': 'History',
'IAS longterm plan.txt': 'International Area Studies',
'IPHVision.txt': 'Interdisciplinary Project in the Humanities',
'Linguistics Program Vision Statement.txt': 'Linguistics',
'MathDept10yearPlanNovember2017.txt': 'Mathematics',
'MCSS-Mission-Vision 1-2-2018.txt': 'McDonnell Center for the Space Sciences',
'Music DepartmentAcademic VisionDecember 2017.txt': 'Music',
'PAD Vision Statement (1).txt': 'Performing Arts Department',
'PBS_Vision_120117.txt': 'Psychological and Brain Sciences',
'PhilosophyVisionStatementFinal.txt': 'Philosophy',
'Physics_Department_Vision_for_Growth.txt': 'Physics',
'political science vision statement.txt': 'Political Science',
'RLL Vision Statement December 2017.txt': 'Romance Languages and Literatures',
'Sociology Vision Statement Dec 2017.txt': 'Sociology',
'Spanish Section Vision Statement .txt': 'Spanish',
'Urban Studies Prg.Planning Document.04.02.17. FINAL.txt': 'Urban Studies',
'Vision East Asian Studies Watt 17Dec01.txt': 'East Asian Studies',
'Vision for the department Dec 1 - JINELC.txt': 'Jewish, Islamic and Near Eastern Languages and Cultures',
'vision statement_Comparative Literature_2017.txt': 'Comparative Literature',
'Vision Statement EALC .txt': 'East Asian Languages and Cultures',
'Vision Statement Latin American Studies 2017.txt': 'Latin American Studies',
'Weidenbaum Center vision.txt': 'Weidenbaum Center',
'WGSSVisionStatement_12-1-17Final.txt': 'Women, Gender, and Sexuality Studies',}
# Hand-curated normalization table: maps raw lower-cased noun chunks
# (as extracted above) to a canonical display term, so near-synonyms
# ('grad students', 'phd students', ...) are counted as one concept.
# Several raw chunks may map to the same canonical term.
key_terms = {'african and african american studies': 'african and african american studies',
'black studies': 'african and african american studies',
'aging societies': 'aging societies',
'american culture': 'american culture',
'american culture studies': 'american culture',
'american history': 'american history',
'ancient art': 'ancient art',
'ancient philosophy': 'ancient philosophy',
'artificial intelligence': 'artificial intelligence',
'big data': 'big data',
'big data initiatives': 'big data',
'computational approaches': 'computational methods',
'computational methods': 'computational methods',
'computational tools': 'computational methods',
'data analysis': 'data science',
'data collection': 'data science',
'data science': 'data science',
'data sets': 'data science',
'digital history': 'digital humanities',
'digital humanities': 'digital humanities',
'digital methods': 'applied data science',
'large data sets': 'big data',
'machine learning': 'applied data science',
'quantitative approaches': 'quantitative approaches',
'quantitative biology': 'quantitative approaches',
'statistical methods': 'quantitative approaches',
'area studies': 'area studies',
'area studies programs': 'area studies',
'art history': 'art history',
'biological problems': 'biological problems',
'biological systems': 'biological systems',
'book history': 'book history',
'brain sciences': 'brain sciences',
'cognitive neuroscience': 'brain sciences',
'cognitive science': 'brain sciences',
'chemical reactions': 'chemical reactions',
'chinese history': 'chinese history',
'climate change': 'climate change',
'comparative literature': 'comparative literature',
'computer science': 'computer science',
'contemporary literature': 'contemporary literature',
'creative writing': 'creative writing',
'cultural history': 'cultural history',
'digital media': 'digital media',
'diverse faculty': 'diverse faculty',
'diversity science': 'diversity science',
'early modern studies': 'early modern studies',
'east asia': 'east asia',
'east asian languages': 'east asia',
'east asian studies': 'east asia',
'economic inequality': 'economic inequality',
'environmental earth sciences': 'environmental earth sciences',
'environmental studies': 'environmental studies',
'ethnic studies': 'ethnic studies',
'ethnographic work': 'ethnographic work',
'extraterrestrial materials': 'extraterrestrial materials',
'film and media studies': 'film and media studies',
'formal modeling': 'formal modeling',
'french literature': 'french literature',
'further development': 'further development',
'global studies': 'global studies',
'grad students': 'graduate education',
'graduate courses': 'graduate education',
'graduate education': 'graduate education',
'graduate programs': 'graduate education',
'graduate studies': 'graduate education',
'phd students': 'graduate education',
'higher education': 'higher education',
'hispanic studies': 'hispanic studies',
'historical questions': 'historical questions',
'human experience': 'human experience',
'innovative courses': 'innovative courses',
'american intellectual history': 'intellectual history',
'cross-disciplinary collaborations': 'interdisciplinary collaboration',
'different disciplines': 'interdisciplinary collaboration',
'different fields': 'interdisciplinary collaboration',
'disciplinary boundaries': 'interdisciplinary collaboration',
'increased collaboration': 'interdisciplinary collaboration',
'interdisciplinary approaches': 'interdisciplinary collaboration',
'interdisciplinary connections': 'interdisciplinary collaboration',
'interdisciplinary courses': 'interdisciplinary collaboration',
'interdisciplinary inquiry': 'interdisciplinary collaboration',
'interdisciplinary study': 'interdisciplinary collaboration',
'interdisciplinary work': 'interdisciplinary collaboration',
'international relations': 'international relations',
'internship opportunities': 'internship opportunities',
'language learning': 'language learning',
'second language acquisition': 'language learning',
'language science': 'language science',
'language study': 'language study',
'latin america': 'latin america',
'latin american studies': 'latin american studies',
'latinx studies': 'latinx studies',
'leadership positions': 'leadership positions',
'leadership roles': 'leadership roles',
'literary studies': 'literary studies',
'community engagement': 'local communities',
'community outreach': 'local communities',
'local communities': 'local communities',
'local organizations': 'local communities',
'st. louis': 'local communities',
'university city': 'local communities',
'material culture': 'material culture',
'media studies': 'media studies',
'medical humanities': 'medical humanities',
'medical school': 'medical school',
'near eastern languages': 'near eastern languages',
'outside speakers': 'outside speakers',
'performance studies': 'performance studies',
'performing arts': 'performing arts',
'planetary exploration': 'planetary sciences',
'planetary sciences': 'planetary sciences',
'plate tectonics': 'plate tectonics',
'political science': 'political science',
'political theory': 'political theory',
'popular music': 'popular music',
'post-doctoral fellows': 'post-doctoral fellows',
'postdoctoral fellows': 'post-doctoral fellows',
'professional programs': 'professional programs',
'public affairs': 'public affairs',
'public discourse': 'public discourse',
'public health': 'public health',
'public policy': 'public policy',
'racial violence': 'racial violence',
'religious studies': 'religious studies',
'research methods': 'research methods',
'romance languages': 'romance languages',
'sam fox': 'sam fox',
'sexuality studies': 'sexuality studies',
'silicon valley': 'silicon valley',
'social class': 'social class',
'social justice': 'social justice',
'social media': 'social media',
'social movements': 'social movements',
'social science': 'social science',
'social sciences': 'social sciences',
'social work': 'social work',
'southern california': 'southern california',
'space sciences': 'space sciences',
'training opportunities': 'training opportunities',
'translation studies': 'translation studies',
'first-year students': 'undergraduate education',
'introductory courses': 'undergraduate education',
'undergraduate education': 'undergraduate education',
'undergraduate majors': 'undergraduate education',
'undergraduate students': 'undergraduate education',
'undergraduate studies': 'undergraduate education',
'undergraduate teaching': 'undergraduate education',
'urban design': 'urban design',
'visual arts': 'visual arts',}
terms_frequencies = {}
terms_departments = {}
for k, v in key_terms.iteritems():
terms_frequencies[v] = 0
terms_departments[v] = []
for n, text in enumerate(features_all_texts):
has_key_term = False
n_key_terms = 0
matching_terms = []
for token in text:
if token in key_terms:
has_key_term = True
n_key_terms += 1
matching_terms.append(key_terms[token])
matching_terms = sorted(list(set(matching_terms)))
department_name = files_to_departments[txt_file_names[n].split('/')[-1]]
for m in matching_terms:
terms_frequencies[m] += 1
terms_departments[m].append(department_name)
print has_key_term, n_key_terms, department_name
%matplotlib inline
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style("white")
print 'len(terms_frequencies)', len(terms_frequencies)
# https://matplotlib.org/examples/color/colormaps_reference.html
# https://amueller.github.io/word_cloud/generated/wordcloud.WordCloud.html#
# https://publicaffairs.wustl.edu/assets/color-palettes/
cmap = mpl.colors.ListedColormap(['#a51417', '#007360', '#6c7373'])
wordcloud = WordCloud(
font_path='/usr/share/fonts/truetype/liberation/LiberationSerif-Bold.ttf',
width=1000,
height=1000,
margin=20,
background_color='white',
prefer_horizontal=0.5,
colormap=cmap).fit_words(terms_frequencies)
plt.figure()
plt.figure(figsize=(15, 15))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()