Procesamiento de archivos con Python

<note>En desarrollo</note>

Posibles herramientas:

Hay dos tipos de reemplazos:

sobre el código fuente: no requieren manejar la estructura HTML
sobre etiquetas/atributos: requieren (o son más simples con) un parser de HTML

Cómo eliminar los comentarios usando BeautifulSoup:

from BeautifulSoup import BeautifulSoup, Comment


def is_comment(node):
    # True for text nodes that are HTML comments.
    return isinstance(node, Comment)


# Sample markup containing two HTML comments.
soup = BeautifulSoup("""1<!--The loneliest number-->
                        <a>2<!--Can be as bad as one--><b>3""")

# Collect the comment nodes first, then detach each one from the tree.
comments = soup.findAll(text=is_comment)
for comment in comments:
    comment.extract()

print(soup)

Cómo detectar ocurrencias de un elemento:

>>> soup = BeautifulSoup(open('v48n2a03.html').read())
>>> soup.findAll(name='link')
[<link rel="stylesheet" type="text/css" href="v48n2a03.css" />]
>>> meta = soup('meta')
>>> meta
[<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />, <meta name="generator" content="TeX4ht (http://www.cse.ohio-state.edu/~gurari/TeX4ht/)"/>, <meta name="originator" content="TeX4ht (http://www.cse.ohio-state.edu/~gurari/TeX4ht/)" />, <meta name="src" content="v48n2a03.tex" />, <meta name="date" content="2008-02-26 11:36:00" />]
>>> [el.extract() for el in meta]
>>> soup('meta')
[]
>>> # O también...
>>> for tag in soup('meta'):
...     tag.extract()
>>>
>>> [el.extract() for el in soup.findAll(name='link')]
>>>
>>> import re
>>> bold = soup.findAll(name='span', attrs={'class': re.compile('cmbx.+')})
>>> italic = soup.findAll(name='span', attrs={'class': re.compile('cm.i.+')})

Funciones:

# Replaces one tag with another, preserving the content
# Used in 01.1 (bold), 01.2 (italic), 04 (h3 -> p)
def replaceTag(tag1, att1=None, tag2=None):
    """Replace each matching ``tag1`` element of the global ``soup`` by ``tag2``.

    tag1 -- tag name to search for.
    att1 -- attribute filter passed to ``soup.findAll``; no filter when
            omitted.
    tag2 -- name of the replacement tag (required; it only has a default
            because it follows the optional ``att1``).

    Raises ValueError when ``tag2`` is not given.
    """
    if tag2 is None:
        raise ValueError('replaceTag() needs the replacement tag name (tag2)')
    if att1 is None:
        att1 = {}
    for t in soup.findAll(tag1, att1):
        newTag = Tag(soup, tag2)
        # The old element's rendered contents become the single child of
        # the new element.
        newTag.insert(0, t.renderContents())
        t.replaceWith(newTag)
 
# Removes a tag, preserving the content
# Used in 03.1 (remove span), 03.2 (remove div)
def removeTag(tag, att=None):
    """Replace each matching ``tag`` element of the global ``soup`` by its contents.

    tag -- tag name to search for.
    att -- attribute filter passed to ``soup.findAll``; no filter when
           omitted.
    """
    if att is None:
        att = {}
    for t in soup.findAll(tag, att):
        # The element is swapped for its own rendered contents, which
        # drops the surrounding tag.
        t.replaceWith(t.renderContents())
 
# Removes an attribute
# Used in 06 (remove class)
def removeAttribute(att):
    """Delete attribute ``att`` from every element of the global ``soup`` that has a non-empty value for it."""
    nonEmpty = re.compile('.+')
    for element in soup.findAll(attrs={att: nonEmpty}):
        del element[att]
 
# Changes one attribute into another
# Used in 05 (align="middle")
def changeAttribute(att1=None, val1=None, att2=None, val2=None):
    """Rename and/or re-value an attribute on elements of the global ``soup``.

    The original was an unfinished template with empty subscripts; the
    parameters below are one reading of it.
    NOTE(review): confirm they match the intended use in step 05.

    att1 -- name of the attribute to replace (required).
    val1 -- value filter for ``att1`` handed to the soup search (string
            or compiled regex); by default any non-empty value, as in
            removeAttribute.
    att2 -- name of the attribute to set; defaults to ``att1``.
    val2 -- value to set; defaults to the value ``att1`` had.

    Raises ValueError when ``att1`` is not given.
    """
    if att1 is None:
        raise ValueError('changeAttribute() needs the attribute name (att1)')
    if val1 is None:
        val1 = re.compile('.+')
    if att2 is None:
        att2 = att1
    for t in soup(attrs={att1: val1}):
        # Read the old value before deleting, so it can be carried over.
        oldValue = t[att1]
        del t[att1]
        if val2 is None:
            t[att2] = oldValue
        else:
            t[att2] = val2
 
# Prepends text to the matching elements
def prepend(tag, attrs=None, text=None):
    """Insert ``text`` as the first child of each matching element of the global ``soup``.

    The original loop had no body; inserting at position 0 is the reading
    suggested by the function name.
    NOTE(review): confirm this is the intended behaviour.

    tag   -- tag name to search for.
    attrs -- attribute filter passed to the soup search; no filter when
             omitted.
    text  -- content to insert (required; it only has a default because
             it follows the optional ``attrs``).

    Raises ValueError when ``text`` is not given.
    """
    if text is None:
        raise ValueError('prepend() needs the text to insert')
    if attrs is None:
        attrs = {}
    for t in soup(tag, attrs):
        t.insert(0, text)
 
# Wraps an element with another
def wrapElement(el, wrapperTag):
    """Put ``el`` inside a new ``wrapperTag`` element located where ``el`` was.

    el         -- element to wrap; it must have a parent.
    wrapperTag -- name of the wrapping tag, e.g. 'p'.
    """
    container = el.parent
    # NOTE(review): list.index compares with ==, so an equal earlier
    # sibling would be found first -- confirm this is acceptable.
    position = container.contents.index(el)
    wrapper = Tag(soup, wrapperTag)
    container.insert(position, wrapper)
    wrapper.append(el)

Para usar Tidy, necesitamos instalar TidyLib más el wrapper para Python (µTidylib)

Script:

import os, re
from BeautifulSoup import BeautifulSoup, Tag, Comment
 
def remove_comments():
  """Detach every HTML comment node from the global ``soup``."""
  def is_comment(node):
    return isinstance(node, Comment)

  for comment in soup.findAll(text=is_comment):
    comment.extract()
 
def remove_tags(tagList):
  """Extract from the global ``soup`` every element whose tag name is in ``tagList``."""
  for name in tagList:
    for element in soup(name):
      element.extract()
 
def clean_bold():
  """Pass ``span`` elements whose class matches ``cmb.+`` to replaceTag, with ``b`` as the new tag."""
  boldClass = re.compile('cmb.+')
  replaceTag('span', {'class': boldClass}, 'b')
 
def clean_italics():
  """Pass ``span`` elements whose class matches ``cmti.+`` to replaceTag, with ``i`` as the new tag."""
  italicClass = re.compile('cmti.+')
  replaceTag('span', {'class': italicClass}, 'i')
 
def wrap_tables():
  """Call wrapElement with 'p' on each ``table`` of class ``equation`` in the global ``soup``."""
  equationTables = soup('table', attrs={'class': 'equation'})  # TODO: review
  for table in equationTables:
    wrapElement(table, 'p')
 
def add_label_width():
  """Placeholder for a cleaning step that is not implemented yet."""
  # TODO: implement. Kept as a no-op so clean() can run the other steps.
  pass
 
def remove_spans():
  """Call ``removeTag('span')``; presumably drops every span while keeping its content -- confirm against removeTag."""
  removeTag('span')
 
def remove_divs():
  """Call ``removeTag('div')``; presumably drops every div while keeping its content -- confirm against removeTag."""
  removeTag('div')
 
def remove_h3():
  """Pass every ``h3`` element to replaceTag, with ``p`` as the new tag."""
  # NOTE(review): h3 -> p follows the "04 (h3 -> p)" usage note on
  # replaceTag -- confirm this is the intended step.
  replaceTag('h3', {}, 'p')
 
def remove_class():
  """Call ``removeAttribute('class')``."""
  removeAttribute('class')
 
def add_font():
  """Placeholder for a cleaning step that is not implemented yet."""
  # TODO: implement. Kept as a no-op so clean() can run the other steps.
  pass
 
def tidy():
  """Placeholder for a cleaning step that is not implemented yet."""
  # TODO: implement. Kept as a no-op so clean() can run the other steps.
  pass
 
def remove_brackets():
  """Placeholder for a cleaning step that is not implemented yet."""
  # TODO: implement. Kept as a no-op so clean() can run the other steps.
  pass
 
def clean(file):
  """Parse ``file`` into the global ``soup`` and run every cleaning step on it.

  file -- path of the HTML file to read.

  Returns the processed soup. Nothing is written back to disk here;
  saving the result is left to the caller.
  """
  # Every step below works on the module-level ``soup``, so it has to be
  # (re)built from the file before they run.
  global soup
  source = open(file)
  try:
    soup = BeautifulSoup(source.read())
  finally:
    source.close()
  remove_comments()
  remove_tags(['meta', 'link'])
  clean_bold()
  clean_italics()
  wrap_tables()
  add_label_width()
  remove_spans()
  remove_divs()
  remove_h3()
  remove_class()
  add_font()
  tidy()
  remove_brackets()
  return soup
 
# Clean the HTML files of the current directory; other entries (this
# script, subdirectories, non-HTML files) are skipped.
for file in os.listdir('.'):
  if os.path.isfile(file) and file.lower().endswith(('.html', '.htm')):
    clean(file)