====== Procesamiento de archivos con Python ====== En desarrollo Posibles herramientas: * [[http://www.crummy.com/software/BeautifulSoup/|Beautiful Soup]] * [[http://docs.python.org/lib/module-HTMLParser.html|HTMLParser]] Hay dos tipos de reemplazos: * sobre el código fuente: no requieren manejar la estructura HTML * sobre etiquetas/atributos: requieren (o son más simples con) un parser de HTML Cómo eliminar los comentarios usando BeautifulSoup: from BeautifulSoup import BeautifulSoup, Comment soup = BeautifulSoup("""1 23""") comments = soup.findAll(text=lambda text:isinstance(text, Comment)) [comment.extract() for comment in comments] print soup Cómo detectar ocurrencias de un elemento: >>> soup = BeautifulSoup(open('v48n2a03.html').read()) >>> soup.findAll(name='link') [] >>> meta = soup('meta') >>> meta [, , , , ] >>> [el.extract() for el in meta] >>> soup('meta') [] >>> # O también... >>> for tag in soup('meta'): ... tag.extract() >>> >>> [el.extract() for el in soup.findAll(name='link')] >>> >>> import re >>> bold = soup.findAll(name='span', attrs={'class': re.compile('cmbx.+')}) >>> italic = soup.findAll(name='span', attrs={'class': re.compile('cm.i.+')}) Funciones: # Reemplaza un tag por otro, preservando el contenido # Usado en 01.1 (bold), 01.2 (italic), 04 (h3 -> p) def replaceTag(tag1, att1={}, tag2): for t in soup.findAll(tag1, att1): newTag = Tag(soup, tag2) newTag.insert(0, t.renderContents()) t.replaceWith(newTag) # Elimina un tag, preservando el contenido # Usado en 03.1 (quitar span), 03.2 (quitar div) def removeTag(tag, att={}): for t in soup.findAll(tag, att): t.replaceWith(t.renderContents()) # Elimina un atributo # Usado en 06 (quitar class) def removeAttribute(att): for t in soup(attrs={att: re.compile('.+')}): del(t[att]) # Cambia un atributo por otro # Usado en 05 (align="middle") def changeAttribute(): for t in soup(attrs={}): del(t[]) t[] = # Antepone... 
def prepend(tag, attrs={}, text): for t in soup(tag, attrs): # Wraps an element with another def wrapElement(el, wrapperTag): wrapper = Tag(soup, wrapperTag) # e.g. 'p' pos = el.parent.contents.index(el) el.parent.insert(pos, wrapper) wrapper.append(el) Para usar Tidy, necesitamos instalar TidyLib más el wrapper para Python (µTidylib) Script: import os, re from BeautifulSoup import BeautifulSoup, Tag, Comment def remove_comments(): comments = soup.findAll(text=lambda text:isinstance(text, Comment)) [comment.extract() for comment in comments] def remove_tags(tagList): for tag in tagList: [t.extract() for t in soup(tag)] def clean_bold(): replaceTag('span', {'class': re.compile('cmb.+')}, 'b') def clean_italics(): replaceTag('span', {'class': re.compile('cmti.+')}, 'i') def wrap_tables(): for t in soup('table', attrs={'class': 'equation'}): # REVISAR wrapElement(t, 'p') def add_label_width(): def remove_spans(): removeTag('span') def remove_divs(): removeTag('div') def remove_h3(): def remove_class(): removeAttribute('class') def add_font(): def tidy(): def remove_brackets(): def clean(file): remove_comments() remove_tags(['meta', 'link']) clean_bold() clean_italics() wrap_tables() add_label_width() remove_spans() remove_divs() remove_h3() remove_class() add_font() tidy() remove_brackets() for file in os.listdir('.'): clean(file)