Source code for russiannames.reader

# -*- coding: UTF-8 -*-

import sys, os.path


class NameReader:
    """Tally surname, first-name and patronymic frequencies from name lists.

    ``path`` is a directory whose entries are themselves directories of text
    files. Every line of every file is split on whitespace; a line with
    exactly three fields is read as ``surname name midname`` and counted,
    any other line is counted in ``n`` and diverted to ``not3.txt``.

    Creating an instance opens (and truncates) ``all.txt`` and ``not3.txt``
    in the current working directory. Call :meth:`close`, or use the
    instance as a context manager, to flush and release them.

    Attributes:
        path: Root directory passed to the constructor.
        encoding: Text encoding used to read the input files.
        names: Mapping of first name to number of occurrences.
        surnames: Mapping of surname to number of occurrences.
        midnames: Mapping of patronymic to number of occurrences.
        n: Number of lines that did not consist of exactly three fields.
        f_all: Open handle of ``all.txt`` (every accepted line).
        f_o: Open handle of ``not3.txt`` (every rejected line).
    """

    def __init__(self, path, encoding='utf-8'):
        """Prepare empty counters and open the two line-dump files.

        Args:
            path: Root directory to scan in :meth:`process`.
            encoding: Encoding of the input files. Output files are always
                written as UTF-8.

        Raises:
            OSError: If ``all.txt`` or ``not3.txt`` cannot be opened.
        """
        self.path = path
        self.encoding = encoding
        self.names = {}
        self.surnames = {}
        self.midnames = {}
        self.n = 0
        # Text mode with an explicit encoding: lines are written as str, so
        # no manual .encode() is needed (bytes + str is a TypeError on Py3).
        self.f_all = open('all.txt', 'w', encoding='utf-8')
        try:
            self.f_o = open('not3.txt', 'w', encoding='utf-8')
        except Exception:
            # Do not leak the first handle if the second open fails.
            self.f_all.close()
            raise

    def __enter__(self):
        """Return the reader itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the output files; exceptions are never suppressed."""
        self.close()
        return False

    def close(self):
        """Flush and close ``all.txt`` and ``not3.txt``.

        Closing an already closed file is a no-op, so this may be called
        more than once.
        """
        try:
            self.f_all.close()
        finally:
            self.f_o.close()

    def process(self):
        """Run :meth:`process_dir` on every entry directly under ``path``.

        Entries are visited in sorted order so the generated files are
        reproducible across platforms.

        Raises:
            OSError: If ``path`` or one of its entries cannot be listed.
        """
        for entry in sorted(os.listdir(self.path)):
            self.process_dir(os.path.join(self.path, entry))

    def process_dir(self, dirname):
        """Read every file in ``dirname`` and update the counters.

        Args:
            dirname: Directory containing the text files to read.

        Raises:
            OSError: If the directory or one of its files cannot be read.
            UnicodeDecodeError: If a file is not valid in ``self.encoding``.
        """
        for fname in sorted(os.listdir(dirname)):
            filename = os.path.join(dirname, fname)
            with open(filename, encoding=self.encoding) as f:
                lines = f.read().splitlines()
            for line in lines:
                parts = line.split()
                if len(parts) != 3:
                    # Anything that is not "surname name midname" (blank
                    # lines included) is only logged and counted.
                    self.f_o.write(line + '\n')
                    self.n += 1
                    continue
                surname, name, midname = parts
                self.names[name] = self.names.get(name, 0) + 1
                self.surnames[surname] = self.surnames.get(surname, 0) + 1
                self.midnames[midname] = self.midnames.get(midname, 0) + 1
                self.f_all.write(line + '\n')

    def write_dict(self, dict, filename):
        """Write ``dict`` to ``filename`` as ``key<TAB>count`` lines.

        Rows are ordered by count, highest first; ``sorted`` is stable, so
        equal counts keep the mapping's own iteration order.

        Args:
            dict: Mapping of string to integer count. The name shadows the
                builtin but is kept so keyword callers keep working.
            filename: Path of the UTF-8 TSV file to create or overwrite.

        Raises:
            OSError: If ``filename`` cannot be opened for writing.
        """
        ranked = sorted(dict.items(), key=lambda item: item[1], reverse=True)
        with open(filename, 'w', encoding='utf-8') as f:
            for key, value in ranked:
                f.write('%s\t%d\n' % (key, value))

    def save(self):
        """Dump the three counters to ``names.tsv``, ``surnames.tsv`` and
        ``midnames.tsv`` in the current working directory."""
        self.write_dict(self.names, 'names.tsv')
        self.write_dict(self.surnames, 'surnames.tsv')
        self.write_dict(self.midnames, 'midnames.tsv')
def name_parse(dirname):
    """Collect name statistics from ``dirname`` and print a short summary.

    Builds a ``NameReader`` for ``dirname``, processes every file below it,
    saves the three frequency tables, then prints:

    * the number of lines that did not have exactly three fields,
    * the number of distinct first names, patronymics and surnames,
    * the 50 most frequent first names as ``count name`` lines.

    Args:
        dirname: Root directory containing the directories of name lists.
    """
    reader = NameReader(dirname)
    try:
        reader.process()
        reader.save()
    finally:
        # Close the reader's output files explicitly so that all.txt and
        # not3.txt are flushed to disk even if processing fails.
        reader.f_all.close()
        reader.f_o.close()

    print(reader.n)
    print(len(reader.names), len(reader.midnames), len(reader.surnames))

    # sorted() takes the ordering function as the keyword-only ``key``
    # argument; a positional two-argument comparator is not accepted.
    top_names = sorted(
        reader.names.items(), key=lambda item: item[1], reverse=True
    )
    for key, value in top_names[:50]:
        # Print the name as text; encoding it first would show a bytes repr.
        print(value, key)
if __name__ == '__main__':
    # Script entry point: the first command-line argument is the root
    # directory of the name lists. Exit with a usage message instead of an
    # IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: %s DIRNAME' % sys.argv[0])
    name_parse(sys.argv[1])