This shows you the differences between two versions of the page.
mkr2cat.py [12/04/2012 00:00] |
mkr2cat.py [12/04/2012 00:00] (current) |
||
---|---|---|---|
Line 1: | Line 1: | ||
+ | ====== mkr2cat.py ====== | ||
+ | |||
+ | |||
+ | <code python> | ||
# coding: utf-8

'''
Convert a bibliographic database in MarcMaker format to "id format".

Usage: python mkr2cat.py DBNAME

Reads ``DBNAME.mkr`` (UTF-8) and writes ``DBNAME.id`` (latin1).  Before
encoding, decomposed character sequences in the subfield data are composed
(Unicode NFC) so that, for example, "i" followed by a combining acute accent
becomes the single latin1 character "í".  Characters that still have no
latin1 equivalent are silently dropped from the output.
'''

import io
import sys
import unicodedata

encoding_in = 'utf-8'
encoding_out = 'latin1'


def convert_line(line):
    '''
    Translate one line of the .mkr file into its "id format" text.

    line -- one decoded (unicode) input line, trailing newline included.

    Returns the text to write, or None when the field must be skipped.
    The trailing newline of a field line is carried through unchanged.
    '''
    if line[:1] != u'=':
        # Any line that is not a field line (normally the blank line that
        # separates records) starts a new record.
        return u'\n!ID 0\n'

    tag = line[1:4]
    # Tags are compared as strings: everything sorting after '900' is
    # dropped, which also covers alphabetic tags such as 'LDR'.
    if tag > u'900':
        return None
    if tag < u'010':
        # Control fields have no indicators or subfields: copy data as is.
        return u'!v%s!%s' % (tag, line[6:])

    # Data fields: two indicator characters followed by the subfields.
    indicators = line[6:8].replace(u'\\', u'#')
    # '$' is the subfield delimiter and becomes '^'; a literal dollar sign
    # is written '{dollar}' in the input and becomes '$'.  The order matters:
    # the delimiters must be rewritten before '{dollar}' is expanded.
    subfields = line[8:].replace(u'$', u'^').replace(u'{dollar}', u'$')
    subfields = unicodedata.normalize('NFC', subfields)
    return u'!v%s!%s%s' % (tag, indicators, subfields)


def convert(dbname):
    '''
    Convert ``<dbname>.mkr`` into ``<dbname>.id``.

    Both files are closed even if the conversion fails part-way.  Decoding
    and encoding happen once, at the file boundary, so every field (control
    fields included) ends up in the same output encoding.
    '''
    with io.open('%s.mkr' % dbname, encoding=encoding_in) as in_file:
        with io.open('%s.id' % dbname, 'w', encoding=encoding_out,
                     errors='ignore') as out_file:
            out_file.write(u'!ID 0\n')
            for line in in_file:
                out = convert_line(line)
                if out is not None:
                    out_file.write(out)


if __name__ == '__main__':
    convert(sys.argv[1])
+ | </code> |