#!/usr/bin/python
# -*- coding: utf-8 -*-

import os, codecs, re, sys
from textblob_de import TextBlobDE
from lxml import etree

# Load the annotated text; the relative path is resolved against the
# current working directory.
tree = etree.parse('Das_Eulenhaus.xml')

#
# Pass 1: give every <sentence> element a 'type' attribute.
#

sentences = tree.xpath('//sentence')

for s in sentences:

    # lxml reports .text as None when an element has no leading text
    # (an empty element, or one that starts directly with a child
    # element).  Fall back to an empty string so such sentences are
    # tagged 'UNKNOWN' instead of crashing the whole run.
    # NOTE(review): only the text before the first child element is
    # inspected; quotation marks inside nested elements are not seen.
    text = s.text or u''

    # A German opening or closing quotation mark marks reported speech;
    # everything else keeps the 'UNKNOWN' label.
    if u'„' in text or u'“' in text:
        sentence_type = 'reported_speech'
    else:
        sentence_type = 'UNKNOWN'

    s.set('type', sentence_type)

#
#
#

paragraphs = tree.xpath('//p')

for p in paragraphs:

    # Maps each distinct sentence 'type' value found in this paragraph
    # to the number of sentences carrying it.
    sentence_types = {}

    sentences = p.xpath('descendant::sentence')

    for s in sentences:
        s_type = s.get('type')
        sentence_types[s_type] = sentence_types.get(s_type, 0) + 1

    if not sentence_types:
        # The paragraph contains no <sentence> descendants at all.
        paragraph_type = 'P_ERROR'
    elif len(sentence_types) == 1:
        # Exactly one kind of sentence: name the paragraph after it.
        # next(iter(...)) fetches the single key without indexing
        # keys(), which is not subscriptable on Python 3 dict views.
        paragraph_type = 'P_' + next(iter(sentence_types)).upper()
    else:
        # Two or more different sentence types in the same paragraph.
        paragraph_type = 'P_MIXED'

    p.set('type', paragraph_type)

#   
#
#   

# Write the annotated document to stdout.  Without an explicit encoding
# etree.tostring() returns an ASCII byte string (non-ASCII characters are
# emitted as numeric character references), so decoding it as ASCII is
# lossless.  The parenthesized, decoded form prints the same text on
# Python 2 and also works on Python 3, where printing the raw bytes would
# show a b'...' repr.
print(etree.tostring(tree, pretty_print=True).decode('ascii'))
