User Tools

Site Tools


mkr2cat.py

This is an old revision of the document!


mkr2cat.py

# coding: utf-8
 
'''
Converts a bibliographic database in MarcMaker format to "id format".
 
The input database is read as UTF-8; the output is written as Latin-1
(decomposed characters are composed, i.e. NFC-normalized, beforehand).
'''
 
import io
import sys
import unicodedata
#import os
encoding_in = 'utf-8'
encoding_out = 'latin1'


def convert_line(line):
    '''
    Translate one decoded line of a MarcMaker file into "id format" text.

    `line` is a unicode string, including its line terminator if it has one.
    Returns the text to write for that line, or None when the line is
    skipped.
    '''
    if line[:1] != u'=':
        # Anything that is not a field line (normally the blank line between
        # records) starts a new record.
        return u'\n!ID 0\n'

    tag = line[1:4]
    if tag > u'900':
        # String comparison: drops tags above 900 and, because letters sort
        # after digits, non-numeric tags such as LDR as well.
        return None
    if tag < u'010':
        # Tags below 010 carry no indicators or subfields here: everything
        # from column 6 onwards is copied as-is.
        return u'!v%s!%s' % (tag, line[6:])

    # Columns 6-7 hold the indicators; a backslash is rewritten as '#'.
    indicators = line[6:8].replace(u'\\', u'#')
    # '$' introduces a subfield and becomes '^'; only afterwards is the
    # '{dollar}' mnemonic expanded, so that literal dollar signs survive.
    subfields = line[8:].replace(u'$', u'^').replace(u'{dollar}', u'$')
    # Compose decomposed characters so they can be represented in the
    # single-byte output encoding.
    subfields = unicodedata.normalize('NFC', subfields)
    return u'!v%s!%s%s' % (tag, indicators, subfields)


def convert(in_file, out_file):
    '''
    Convert every line of `in_file` and write the result to `out_file`.

    `in_file` is an iterable of decoded text lines; `out_file` is any object
    with a `write` method accepting text. The output always starts with the
    header of the first record.
    '''
    out_file.write(u'!ID 0\n')
    for line in in_file:
        out = convert_line(line)
        if out is not None:
            out_file.write(out)


def main(argv):
    '''
    Convert DBNAME.mkr into DBNAME.id, where DBNAME is argv[1].

    The input is decoded as `encoding_in`; the output is encoded as
    `encoding_out`, silently dropping characters that encoding cannot
    represent.
    '''
    if len(argv) < 2:
        sys.exit('usage: mkr2cat.py dbname')
    dbname = argv[1]

    # io.open behaves the same on Python 2 and 3: it decodes/encodes at the
    # file boundary (so every field, not only data fields, is transcoded)
    # and reads in universal-newline mode. The input is opened first, so a
    # missing .mkr file does not leave an empty .id file behind.
    with io.open('%s.mkr' % dbname, encoding=encoding_in) as in_file, \
            io.open('%s.id' % dbname, 'w', encoding=encoding_out,
                    errors='ignore') as out_file:
        convert(in_file, out_file)


if __name__ == '__main__':
    main(sys.argv)
mkr2cat.py.1334261551.txt.gz · Last modified: 12/04/2012 00:00 (external edit)