Source code for russiannames.parser

# -*- coding: UTF-8 -*-

import sys, os.path
import logging
from pymongo import MongoClient
import re
from .consts import *

def use_rule(name, rule):
    """Apply a suffix-rewriting rule to *name* and return the new string.

    *rule* is a whitespace-separated list of operations applied left to right:

    * ``-xyz`` removes as many trailing characters as ``xyz`` is long (the
      characters themselves are not compared with the end of the name);
    * ``+xyz`` appends ``xyz``.

    Parts starting with any other character are ignored.
    """
    result = name
    for part in rule.split():
        op, payload = part[0], part[1:]
        if op == '-':
            # A bare '-' has nothing to strip; without this guard
            # ``result[:-0]`` would wipe the whole string.
            if payload:
                result = result[:-len(payload)]
        elif op == '+':
            result += payload
    return result
def guess_by_rules(name, rules):
    """Find the first rule whose regex matches *name*.

    *rules* maps regular-expression patterns to arbitrary values. Patterns are
    tried in the mapping's iteration order with ``re.match`` (anchored at the
    start of *name*). Returns the ``(pattern, value)`` pair of the first hit,
    or ``None`` when nothing matches.
    """
    hits = (
        (pattern, payload)
        for pattern, payload in rules.items()
        if re.match(pattern, name)
    )
    return next(hits, None)
def norm_name(text):
    """Normalise one name token: drop leading/trailing dots, then title-case it."""
    without_dots = text.strip('.')
    return without_dots.title()
# Name of the MongoDB database that NamesParser selects on its client connection.
NAMES_DB = 'names'
class NamesParser:
    """Identify the parts of Russian personal names with MongoDB lookups.

    The ``names`` (first names), ``surnames`` and ``midnames`` collections of
    the ``NAMES_DB`` database are queried by their ``text`` field. The rule
    tables ``SURNAME_POSTRULES``, ``MIDDLENAME_POSTRULES`` and
    ``SURN_NATIONAL_RULES`` are the fallback when a lookup finds nothing.
    """

    # Name tokens are separated by dots or whitespace (e.g. "А.Н. Хомяков").
    # Raw string: '\.' and '\s' are invalid escapes in a plain literal.
    _SPLIT_RE = re.compile(r'\.|\s')

    def __init__(self):
        """Open a MongoDB client with its default settings and select ``NAMES_DB``."""
        self._conn = MongoClient()
        self._db = self._conn[NAMES_DB]

    @staticmethod
    def _rule_gender(match):
        """Map the value of a matched gender rule to ``'m'``/``'f'`` (else ``None``)."""
        if match[1] == GENDER_MALE:
            return 'm'
        if match[1] == GENDER_FEMALE:
            return 'f'
        return None

    def _parse_one(self, parts):
        """Classify a single token as a first name or, failing that, a surname."""
        token = parts[0]
        if self._db['names'].find_one({'text': token}):
            return {'format': 'f', 'fn': token}
        if self._db['surnames'].find_one({'text': token}):
            return {'format': 's', 'sn': token}
        return {}

    def _parse_two(self, parts):
        """Classify two tokens: initial + surname, or surname/first/middle pairs."""
        ncoll = self._db['names']
        mcoll = self._db['midnames']
        first, second = parts

        # A one-character token next to a longer one is an initial.
        if len(first) == 1 and len(second) > 1:
            return {'format': 'Fs', 'sn': second, 'fn_s': first}
        if len(second) == 1 and len(first) > 1:
            return {'format': 'sF', 'sn': first, 'fn_s': second}

        the_n1 = ncoll.find_one({'text': first})
        the_n2 = ncoll.find_one({'text': second})

        # The second token is the first name when only it is known as one, or
        # when both are known and the second has the higher 'count'.
        if the_n2 and (not the_n1 or the_n2['count'] > the_n1['count']):
            return {'format': 'sf', 'sn': first, 'fn': second}

        if the_n1:
            # First token is the first name; the second is a middle name if
            # the midnames collection knows it, otherwise a surname.
            if mcoll.find_one({'text': second}):
                return {'format': 'fm', 'mn': second, 'fn': first}
            return {'format': 'fs', 'sn': second, 'fn': first}

        # Neither token is a known first name: accept "surname first-name"
        # only if the first token looks like a surname by rule.
        res = guess_by_rules(first, SURNAME_POSTRULES)
        if not res:
            return {}
        result = {'format': 'sf', 'sn': first, 'fn': second}
        gender = self._rule_gender(res)
        if gender:
            result['gender'] = gender
        return result

    def _parse_three(self, parts):
        """Classify three tokens (full names and/or initials)."""
        ncoll = self._db['names']
        mcoll = self._db['midnames']
        first, second, third = parts

        if len(first) == 1:
            if len(second) != 1:
                return {}
            if len(third) == 1:
                return {'format': 'SFM', 'sn_s': first, 'fn_s': second, 'mn_s': third}
            return {'format': 'FMs', 'sn': third, 'fn_s': first, 'mn_s': second}

        if len(second) == 1:
            if len(third) == 1:
                return {'format': 'sFM', 'sn': first, 'fn_s': second, 'mn_s': third}
            return {}

        if len(third) == 1:
            return {'format': 'sfM', 'sn': first, 'fn': second, 'mn_s': third}

        # Three full words: a known or rule-matched middle name in last
        # position means "surname first-name middle-name".
        if mcoll.find_one({'text': third}):
            return {'format': 'sfm', 'sn': first, 'fn': second, 'mn': third}

        res = guess_by_rules(third, MIDDLENAME_POSTRULES)
        if res:
            result = {'format': 'sfm', 'sn': first, 'fn': second, 'mn': third}
            gender = self._rule_gender(res)
            if gender:
                result['gender'] = gender
            return result

        # Otherwise decide by where a known first name sits.
        if ncoll.find_one({'text': first}):
            return {'format': 'fms', 'sn': third, 'fn': first, 'mn': second}
        if ncoll.find_one({'text': second}):
            return {'format': 'sfm', 'sn': first, 'fn': second, 'mn': third}
        return {}

    @staticmethod
    def _parse_four(parts):
        """Handle four-token names ending in the 'Оглы' / 'Кызы' patronymic suffix."""
        suffix_gender = {'Оглы': 'm', 'Кызы': 'f'}.get(parts[3])
        if suffix_gender is None:
            return {}
        # NOTE(review): only the third token is stored as the middle name; the
        # suffix itself is dropped (the original joined the slice parts[2:3]).
        # Confirm whether parts[2:4] was intended.
        return {'format': 'sfm', 'sn': parts[0], 'fn': parts[1],
                'mn': parts[2], 'gender': suffix_gender}

    def _resolve_gender(self, result):
        """Fill ``result['gender']`` from the db, falling back to the rule tables.

        Any gender set earlier during format detection is discarded: the value
        starts at ``'-'`` and is then derived from the middle name, the first
        name (which overrides the middle name) and finally the surname (only
        when the gender is still ``'u'`` or ``'-'``).
        """
        result['gender'] = '-'

        if 'mn' in result:
            doc = self._db['midnames'].find_one({'text': result['mn']})
            if doc and 'gender' in doc:
                result['gender'] = doc['gender']
            else:
                res = guess_by_rules(result['mn'], MIDDLENAME_POSTRULES)
                gender = self._rule_gender(res) if res else None
                if gender:
                    result['gender'] = gender

        if 'fn' in result:
            doc = self._db['names'].find_one({'text': result['fn']})
            if doc and 'gender' in doc:
                result['gender'] = doc['gender']

        if 'sn' in result and result['gender'] in ['u', '-']:
            doc = self._db['surnames'].find_one({'text': result['sn']})
            if doc and 'gender' in doc:
                result['gender'] = doc['gender']
            else:
                res = guess_by_rules(result['sn'], SURNAME_POSTRULES)
                gender = self._rule_gender(res) if res else None
                if gender:
                    result['gender'] = gender

    def parse(self, text):
        """Split *text* into name tokens and work out which token is which.

        Returns a dict that always contains ``text`` (the input) and ``parsed``
        (bool). On success it also contains:

        * ``format`` - one letter per token: ``s`` surname, ``f`` first name,
          ``m`` middle name; upper-case when the token is a single character
          (an initial);
        * ``sn`` / ``fn`` / ``mn`` for full tokens, ``sn_s`` / ``fn_s`` /
          ``mn_s`` for initials;
        * ``gender`` - a value stored in the db, ``'m'``/``'f'`` from the rule
          tables, or ``'-'`` when nothing could be determined.

        Inputs with no tokens or more than four tokens are not parsed.
        """
        tokens = (norm_name(part) for part in self._SPLIT_RE.split(text))
        parts = [token for token in tokens if token]

        if len(parts) == 1:
            result = self._parse_one(parts)
        elif len(parts) == 2:
            result = self._parse_two(parts)
        elif len(parts) == 3:
            result = self._parse_three(parts)
        elif len(parts) == 4:
            result = self._parse_four(parts)
        else:
            result = {}

        # Same truth table as the original
        # ``result and 'gender' not in result or ('gender' in result and
        # result['gender'] != 'u')``.
        if result and result.get('gender') != 'u':
            self._resolve_gender(result)
            result['text'] = text
            result['parsed'] = True
            return result

        result['parsed'] = 'format' in result
        result['text'] = text
        return result

    @staticmethod
    def _tally(doc, genders, ethnics):
        """Count *doc*'s gender and merge its ethnic tags (order-preserving, unique)."""
        if 'gender' in doc:
            genders[doc['gender']] = genders.get(doc['gender'], 0) + 1
        if 'ethnic' in doc:
            for tag in doc['ethnic']:
                if tag not in ethnics:
                    ethnics.append(tag)

    def classify(self, sn, fn, mn):
        """Estimate gender and ethnic tags for an already split name.

        Each of middle name, first name and surname casts one gender vote,
        taken from its db document or, for middle name and surname, from the
        rule tables when the db has no document. Ethnic tags are collected
        from the documents and, for a surname missing from the db, from
        ``SURN_NATIONAL_RULES``.

        Returns ``{'ethnics': [...]}`` plus ``'gender'`` when at least one
        vote was cast. The most frequent gender wins, except that a leading
        ``'u'`` yields to the runner-up.
        """
        genders = {}
        ethnics = []

        doc = self._db['midnames'].find_one({'text': mn})
        logging.debug('Last name %s, first name %s, middle name %s', sn, fn, mn)
        if doc:
            logging.debug('Middle name found in db %s', doc)
            self._tally(doc, genders, ethnics)
        else:
            res = guess_by_rules(mn, MIDDLENAME_POSTRULES)
            logging.debug('Rules result on middle name %s', res)
            if res:
                vote = self._rule_gender(res) or 'u'
                genders[vote] = genders.get(vote, 0) + 1

        doc = self._db['names'].find_one({'text': fn})
        if doc:
            self._tally(doc, genders, ethnics)

        doc = self._db['surnames'].find_one({'text': sn})
        if doc:
            self._tally(doc, genders, ethnics)
        else:
            res = guess_by_rules(sn, SURNAME_POSTRULES)
            if res:
                vote = self._rule_gender(res) or 'u'
                genders[vote] = genders.get(vote, 0) + 1
            res = guess_by_rules(sn, SURN_NATIONAL_RULES)
            if res:
                for tag in res[1]:
                    if tag not in ethnics:
                        ethnics.append(tag)

        logging.debug('Genders list %s', genders)
        # Stable sort: ties keep the order in which the votes were first cast.
        ranked = sorted(genders.items(), key=lambda item: item[1], reverse=True)

        result = {'ethnics': ethnics}
        if len(ranked) > 1 and ranked[0][0] == 'u':
            result['gender'] = ranked[1][0]
        elif ranked:
            result['gender'] = ranked[0][0]
        return result
if __name__ == '__main__':
    # Manual smoke run: needs a reachable MongoDB holding the names database.
    import locale

    parser = NamesParser()
    samples = (
        'Исинбаев Иван Моисеевич',
        'Иванов Шалва Ицхакович',
        'Иван Алексеевич',
        'Сидор Федоров',
        'Акимов Б.В.',
        'А.Н. Хомяков',
    )
    for sample in samples:
        print(parser.parse(sample))
    print(parser.classify('Козлевич', 'Иннокентий', 'Мафусаилович'))