# mkr2cat.py
# coding: utf-8
'''
Converts a bibliographic database in MarcMaker format to "id format".
The original database is encoded as utf-8; the output is encoded as
latin1 (decomposed characters are composed before encoding).
'''
import io
import sys
import unicodedata
#import os
encoding_in = 'utf-8'
encoding_out = 'latin1'

# Line that opens every record in the "id format" output.
RECORD_HEADER = u'!ID 0\n'


def convert_line(line):
    """Translate one line of MarcMaker text into "id format" text.

    ``line`` is a decoded (unicode) line, line terminator included.

    Returns the text to append to the output, or ``None`` when the line
    produces no output:

    * a line that does not start with '=' yields a blank line followed by
      a new record header;
    * a field whose tag sorts after '900' (string comparison, so this also
      covers non-numeric tags such as 'LDR') is dropped;
    * a field whose tag sorts before '010' is emitted as ``!vTAG!data``;
    * any other field is emitted as ``!vTAG!`` + two indicator characters
      + subfields.
    """
    if line[:1] != u'=':
        return u'\n' + RECORD_HEADER

    tag = line[1:4]
    if tag > u'900':
        return None

    if tag < u'010':
        # Columns 4-5 are skipped; everything from column 6 on is the data.
        data = unicodedata.normalize('NFC', line[6:])
        return u'!v%s!%s' % (tag, data)

    # Backslashes in the two indicator positions become '#'.
    indicators = line[6:8].replace(u'\\', u'#')
    # Order matters: the '$' delimiters are rewritten to '^' first, so the
    # literal '$' produced from '{dollar}' is not turned into '^' as well.
    subfields = line[8:].replace(u'$', u'^').replace(u'{dollar}', u'$')
    # Compose decomposed characters so they survive the latin1 encoding.
    subfields = unicodedata.normalize('NFC', subfields)
    return u'!v%s!%s%s' % (tag, indicators, subfields)


def convert(dbname):
    """Convert ``<dbname>.mkr`` into ``<dbname>.id``.

    The input is read as ``encoding_in`` and the output written as
    ``encoding_out``; characters the output encoding cannot represent are
    silently dropped.  Both files are closed even if the conversion fails.

    Raises IOError/OSError if a file cannot be opened and
    UnicodeDecodeError if the input is not valid ``encoding_in``.
    """
    with io.open('%s.mkr' % dbname, 'r', encoding=encoding_in) as in_file:
        with io.open('%s.id' % dbname, 'w', encoding=encoding_out,
                     errors='ignore') as out_file:
            out_file.write(RECORD_HEADER)
            for line in in_file:
                out = convert_line(line)
                if out is not None:
                    out_file.write(out)


def main(argv=None):
    """Command-line entry point: ``mkr2cat.py DBNAME``."""
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.exit('usage: mkr2cat.py DBNAME  (reads DBNAME.mkr, writes DBNAME.id)')
    convert(argv[1])


if __name__ == '__main__':
    main()