# -*- coding: utf-8 -*-
import re
import codecs
import sys
from pprint import pprint
# Inputs: two lists of words to leave out of the generated output.
f_exclusions_name = 'words_posted2.txt'
f_exclusions_name2 = 'words_I_want_to_exclude_3.txt'

# Input: the file that gets converted.
file_in_name = 'anki_out3.txt'

# Outputs: the HTML table, the plain-text word list, and the list of
# words to exclude on the next run.
file_out_htm_name = 'out3.htm'
file_out_txt_name = 'out3.txt'
word_to_exclude_next_time_file_name = 'words_posted3.txt'

# Opening markup written at the top of the HTML output file.
html_head = u'''
<html>
<head>
<title>Words list 3</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
</head>
<body>
'''

# Words requested for exclusion (word -> True).
to_exclude_dict = dict()
# Words that were actually found in the input and excluded (word -> True).
excluded_words_dict = dict()
# Companion dict for the "exclude next time" list.
word_to_exclude_next_time_dict = dict()
# Forming the to_exclude_dict dictionary from both exclusion lists.
def _load_exclusions(path, target):
    """Add every line of the UTF-8 file *path* as a key of *target*.

    Trailing newlines, carriage returns and spaces are stripped from each
    line before it is stored (value ``True``).

    Returns True when the file could be opened. On IOError while opening,
    a message is printed and False is returned; the caller decides whether
    that is fatal.
    """
    try:
        exclusions_file = codecs.open(path, encoding='UTF-8')
    except IOError:
        print('Can\'t open the file {0}'.format(path))
        return False
    # The handle is closed only here, i.e. only when opening succeeded.
    # Closing in a ``finally`` of the opening ``try`` would touch an
    # unbound (or stale) name whenever the open itself failed.
    with exclusions_file:
        for line in exclusions_file:
            target[line.rstrip('\n\r ')] = True
    return True


# The first list is mandatory: without it the script stops.
if not _load_exclusions(f_exclusions_name, to_exclude_dict):
    sys.exit(1)

# The second list is optional: a missing file is only reported.
_load_exclusions(f_exclusions_name2, to_exclude_dict)
# Convert the tab-separated input file into three outputs: an HTML table,
# a plain-text word list, and the list of every word seen (to be excluded
# on the next run).
pattern1 = re.compile(r'\t')  # field separator
pattern2 = re.compile(r'</?span.*?>|^ +| +$|\n')  # markup/padding around the word
pattern3 = re.compile(r'<img src="(.*?)" />')  # image tag inside a definition

# Replacement for pattern3: prefixes the image name with the public Dropbox
# address. The backslash is doubled so the literal contains ``\g<1>``
# without relying on an invalid string escape.
_image_replacement = (
    u'<img style="border:5px solid white" '
    u'src="http://dl.dropbox.com/u/7801003/Different.media/\\g<1>" '
    u'width="50%" />'
)


def _open_or_report(path, mode='r'):
    """Open *path* as UTF-8 text in *mode*.

    Returns the file object, or None after printing a message when the
    file cannot be opened (IOError).
    """
    try:
        return codecs.open(path, mode, encoding='UTF-8')
    except IOError:
        print('Can\'t open the file {0}'.format(path))
        return None


def _render_row(number, fields):
    """Return the HTML table row for one input line.

    *number* is the row number shown in the first cell. *fields* must hold
    at least four items: ``fields[0]`` is the word (written with its
    original markup), ``fields[2]`` the definition and ``fields[3]`` the
    examples; ``fields[1]`` is not used.
    """
    parts = [u'<tr><td>{0}</td><td>'.format(number), fields[0], u'</td><td>']
    # sub() leaves the text untouched when there is no image tag, so no
    # separate search is needed.
    definition = pattern3.sub(_image_replacement, fields[2])
    if definition.strip():
        parts.append(definition)
    else:
        # An empty cell may be drawn without borders, so fill it with a
        # non-breaking space.
        parts.append(u'&nbsp;')
    # strip() so that a field holding only the line's trailing newline
    # counts as "no examples".
    if fields[3].strip():
        parts.append(u'<p><span style="color:blue; font-style:italic;">')
        parts.append(fields[3])
        parts.append(u'</span></p>')
    parts.append(u'</td></tr>\n')
    return u''.join(parts)


# Files are opened in this order and opening stops at the first failure,
# so no output file is created or truncated when an earlier file is missing.
_handles = []
try:
    for _path, _mode in ((file_in_name, 'r'),
                         (file_out_htm_name, 'w'),
                         (file_out_txt_name, 'w'),
                         (word_to_exclude_next_time_file_name, 'w')):
        _handle = _open_or_report(_path, _mode)
        if _handle is None:
            break  # already reported; skip the conversion
        _handles.append(_handle)
    else:
        # Reached only when all four files were opened.
        f_in, f_out_htm, f_out_txt, word_to_exclude_next_time_file = _handles

        f_out_htm.write(html_head)
        f_out_htm.write(u'<table border="1" cellspacing ="0" cellpadding ="3" width="100%">')
        f_out_htm.write(u'<tr>')
        f_out_htm.write(u'<th>№</th><th>Слово</th><th>Значение</th>')
        f_out_htm.write(u'</tr>\n')

        i = 0  # number of the last table row written
        for line in f_in:
            strings = pattern1.split(line)
            # Short lines get empty trailing fields instead of raising
            # IndexError when the definition/examples are read.
            strings += [u''] * (4 - len(strings))
            word = pattern2.sub('', strings[0])  # word stripped of html garbage
            word_to_exclude_next_time_file.write(word + u'\n')
            if word in to_exclude_dict:
                # Remember that the exclusion was applied; no row is written.
                excluded_words_dict[word] = True
                continue
            i += 1
            f_out_txt.write(word + u'\n')
            f_out_htm.write(_render_row(i, strings))
        f_out_htm.write(u'</table>')
finally:
    # Close only what was actually opened, most recently opened first.
    for _handle in reversed(_handles):
        _handle.close()
# Words that were requested for exclusion but never matched a word in the
# input, so nothing was excluded for them.
not_excluded = set(to_exclude_dict) - set(excluded_words_dict)
# The heading has no placeholder, so it is printed as-is. Formatting it
# with the last processed word raised NameError when no line was read.
print(u'\nWords that haven\'t been excluded:')
# Sorted so that repeated runs print the words in the same order.
for word in sorted(not_excluded):
    print(word)
print('\nParsing completed')