#!/usr/bin/python
# -*- coding: utf-8 -*-

import codecs
import collections
import os
import re
import sys

# Characters that separate words.  Every delimiter is also reported as a
# token of its own (the split pattern captures it), so punctuation shows up
# in the frequency list next to the words.
DELIMITER_CHARS = u'~`!@#$%^&*()[{]};:\'"<,>.?/\\_-+=»«–„“’›‹‚‘…'

# re.escape() keeps every delimiter literal inside the character class.  The
# previous hand-escaped alternation was a non-raw string, so its '\\|\_' part
# reached the regex engine as '\|\_' and matched only the two-character
# sequence '|_' instead of a backslash or an underscore.
TOKEN_SPLIT_RE = re.compile(
    u'(\\s|[' + re.escape(DELIMITER_CHARS) + u'])', re.UNICODE)

# re.UNICODE makes \s cover non-ASCII whitespace on Python 2 as well.
WHITESPACE_RE = re.compile(u'\\s+', re.UNICODE)


def read_text(path):
    """Return the UTF-8 text stored at *path* with whitespace runs collapsed.

    Every run of whitespace (including newlines) becomes a single space.
    The file is closed before returning.
    """
    with codecs.open(path, 'r', encoding='utf-8') as handle:
        return WHITESPACE_RE.sub(u' ', handle.read())


def tokenize(text):
    """Split *text* into lower-cased tokens.

    Words and the delimiters between them (punctuation and whitespace) are
    both returned.  The empty strings that re.split() produces between two
    adjacent delimiters, or at the edges of the text, are dropped because
    they are artifacts of the split and not tokens.
    """
    return [piece.lower() for piece in TOKEN_SPLIT_RE.split(text) if piece]


def token_frequencies(tokens):
    """Return ``(relative_frequency, token)`` pairs, most frequent first.

    The frequency is the token's count divided by ``len(tokens)``.  Ties are
    ordered by token in descending order.  An empty *tokens* sequence yields
    an empty list.
    """
    total = float(len(tokens))
    counts = collections.Counter(tokens)
    return sorted(
        ((count / total, token) for token, count in counts.items()),
        reverse=True)


def main(argv):
    """Print one ``token<TAB>frequency`` line per distinct token.

    *argv* is the full argument vector; ``argv[1]`` names the input file.
    Exits with a usage message when the file argument is missing.
    """
    if len(argv) < 2:
        sys.exit('usage: %s TEXT_FILE' % argv[0])
    tokens = tokenize(read_text(argv[1]))
    for frequency, token in token_frequencies(tokens):
        # A single parenthesised argument prints identically on Python 2
        # (print statement) and Python 3 (print function).
        print(token + u'\t' + str(frequency))


if __name__ == '__main__':
    main(sys.argv)
