====== Procesamiento de archivos con Python ======
En desarrollo
Posibles herramientas:
* [[http://www.crummy.com/software/BeautifulSoup/|Beautiful Soup]]
* [[http://docs.python.org/lib/module-HTMLParser.html|HTMLParser]]
Hay dos tipos de reemplazos:
* sobre el código fuente: no requieren manejar la estructura HTML
* sobre etiquetas/atributos: requieren (o son más simples con) un parser de HTML
Cómo eliminar los comentarios usando BeautifulSoup:
from BeautifulSoup import BeautifulSoup, Comment

# Sample document containing two HTML comments. The comments had been lost
# from the sample string; without them the example removes nothing.
soup = BeautifulSoup("""1<!--The loneliest number-->
<a>2<!--Can be as bad as one--><b>3""")

# Comment nodes are the text nodes that are instances of Comment.
comments = soup.findAll(text=lambda text: isinstance(text, Comment))

# Detach each comment from the tree (a plain loop, since extract() is
# called only for its side effect).
for comment in comments:
    comment.extract()

# Parenthesized form prints the same under Python 2 and also parses on 3.
print(soup)
Cómo detectar ocurrencias de un elemento:
>>> soup = BeautifulSoup(open('v48n2a03.html').read())
>>> soup.findAll(name='link')
[]
>>> meta = soup('meta')
>>> meta
[<meta ... />, <meta ... />, <meta ... />, <meta ... />, <meta ... />]
>>> [el.extract() for el in meta]
>>> soup('meta')
[]
>>> # O también...
>>> for tag in soup('meta'):
...     tag.extract()
>>>
>>> [el.extract() for el in soup.findAll(name='link')]
>>>
>>> import re
>>> bold = soup.findAll(name='span', attrs={'class': re.compile('cmbx.+')})
>>> italic = soup.findAll(name='span', attrs={'class': re.compile('cm.i.+')})
Funciones:
def replaceTag(tag1, att1=None, tag2=None):
    """Replace every matching tag with a new tag, preserving its content.

    Used in steps 01.1 (bold), 01.2 (italic) and 04 (h3 -> p).
    Operates on the module-level ``soup``.

    tag1 -- name of the tags to search for.
    att1 -- optional attribute filter handed to ``soup.findAll``.
    tag2 -- name of the replacement tag (required).

    Raises ValueError when ``tag2`` is not supplied.
    """
    # A parameter without a default cannot follow one with a default, so
    # ``tag2`` carries a placeholder default and is validated here instead.
    if tag2 is None:
        raise ValueError('replaceTag() needs the replacement tag name (tag2)')
    if att1 is None:
        att1 = {}  # fresh dict per call instead of a shared mutable default
    for t in soup.findAll(tag1, att1):
        newTag = Tag(soup, tag2)
        # The old tag's rendered content becomes the new tag's first child.
        newTag.insert(0, t.renderContents())
        t.replaceWith(newTag)
def removeTag(tag, att=None):
    """Remove every matching tag, preserving its content.

    Used in steps 03.1 (remove span) and 03.2 (remove div).
    Operates on the module-level ``soup``.

    tag -- name of the tags to search for.
    att -- optional attribute filter handed to ``soup.findAll``.
    """
    if att is None:
        att = {}  # fresh dict per call instead of a shared mutable default
    for t in soup.findAll(tag, att):
        # Swap the tag for its own rendered content, dropping the wrapper.
        t.replaceWith(t.renderContents())
def removeAttribute(att):
    """Delete attribute ``att`` from the tags found in the module-level soup.

    Used in step 06 (remove class).

    Only tags whose ``att`` value matches the regular expression '.+' are
    selected.
    """
    any_value = re.compile('.+')
    carriers = soup(attrs={att: any_value})
    for node in carriers:
        del node[att]
def changeAttribute(oldAtt=None, newAtt=None, newValue=None):
    """Replace one attribute with another on every tag that carries it.

    Used in step 05 (align="middle").
    Operates on the module-level ``soup``.

    NOTE(review): the original body was an unfilled template (attribute
    names and value left blank); the parameters below are one way of
    filling it in -- confirm against the intended step 05 behavior.

    oldAtt   -- name of the attribute to remove.
    newAtt   -- name of the attribute to set in its place.
    newValue -- value for ``newAtt``; when omitted, the value previously
                held by ``oldAtt`` is carried over.

    Raises ValueError when ``oldAtt`` or ``newAtt`` is not supplied.
    """
    if oldAtt is None or newAtt is None:
        raise ValueError('changeAttribute() needs both oldAtt and newAtt')
    # Same selection rule as removeAttribute: tags whose value matches '.+'.
    for t in soup(attrs={oldAtt: re.compile('.+')}):
        previous = t[oldAtt]
        del t[oldAtt]
        if newValue is None:
            t[newAtt] = previous
        else:
            t[newAtt] = newValue
def prepend(tag, attrs=None, text=None):
    """Prepend ``text`` to the content of every matching tag.

    Operates on the module-level ``soup``.

    NOTE(review): the original loop body was missing; inserting ``text`` as
    the first child of each match is an interpretation of "prepend" --
    confirm it is the intended behavior.

    tag   -- name of the tags to search for.
    attrs -- optional attribute filter handed to ``soup``.
    text  -- content to insert at position 0 of each match (required).

    Raises ValueError when ``text`` is not supplied.
    """
    # A parameter without a default cannot follow one with a default, so
    # ``text`` carries a placeholder default and is validated here instead.
    if text is None:
        raise ValueError('prepend() needs the text to insert')
    if attrs is None:
        attrs = {}  # fresh dict per call instead of a shared mutable default
    for t in soup(tag, attrs):
        t.insert(0, text)
def wrapElement(el, wrapperTag):
    """Wrap ``el`` inside a newly created ``wrapperTag`` element.

    el         -- element to wrap; it must have a parent.
    wrapperTag -- name of the wrapping tag, e.g. 'p'.
    """
    parent = el.parent
    # Remember where el sits so the wrapper is inserted at the same index.
    position = parent.contents.index(el)
    container = Tag(soup, wrapperTag)
    parent.insert(position, container)
    # Appending el to the wrapper is what places it inside the new tag.
    container.append(el)
Para usar Tidy, necesitamos instalar TidyLib más el wrapper para Python (µTidylib)
Script:
import os, re
from BeautifulSoup import BeautifulSoup, Tag, Comment
def remove_comments():
    """Detach every comment node from the module-level ``soup``.

    Comment nodes are the text nodes that are instances of ``Comment``.
    """
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    # Plain loop: extract() is called for its side effect only, so there is
    # no reason to build a throwaway list of its return values.
    for comment in comments:
        comment.extract()
def remove_tags(tagList):
    """Detach from the module-level ``soup`` every tag named in ``tagList``.

    tagList -- iterable of tag names, e.g. ['meta', 'link'].
    """
    for tag in tagList:
        # Plain loop: extract() is called for its side effect only.
        for t in soup(tag):
            t.extract()
def clean_bold():
    """Turn <span> tags whose class matches 'cmb.+' into <b> tags."""
    bold_class = re.compile('cmb.+')
    replaceTag('span', {'class': bold_class}, 'b')
def clean_italics():
    """Turn <span> tags whose class matches 'cmti.+' into <i> tags."""
    italic_class = re.compile('cmti.+')
    replaceTag('span', {'class': italic_class}, 'i')
def wrap_tables():
    """Wrap each <table class="equation"> in the module-level soup in a <p>."""
    # TODO: review (this selection was flagged for revision by the author).
    equation_tables = soup('table', attrs={'class': 'equation'})
    for table in equation_tables:
        wrapElement(table, 'p')
def add_label_width():
    """Placeholder for the label-width step; not implemented yet.

    Deliberately a no-op so that clean(), which calls it after
    wrap_tables(), can run the remaining steps.
    """
    # TODO: implement this step.
def remove_spans():
    """Strip every <span> wrapper via removeTag (step 03.1)."""
    span_tag = 'span'
    removeTag(span_tag)
def remove_divs():
    """Strip every <div> wrapper via removeTag (step 03.2)."""
    div_tag = 'div'
    removeTag(div_tag)
def remove_h3():
    """Turn every <h3> into a <p>, keeping its content (step 04).

    NOTE(review): the original function had no body. This implementation
    follows the replaceTag usage note "04 (h3 -> p)" -- confirm that this
    is the intended behavior.
    """
    replaceTag('h3', {}, 'p')
def remove_class():
    """Drop the ``class`` attribute via removeAttribute (step 06)."""
    attribute_name = 'class'
    removeAttribute(attribute_name)
def add_font():
    """Placeholder for the add-font step; not implemented yet.

    Deliberately a no-op so that clean(), which calls it after
    remove_class(), can run the remaining steps.
    """
    # TODO: implement this step.
def tidy():
    """Placeholder for the Tidy step; not implemented yet.

    Deliberately a no-op so that clean(), which calls it after add_font(),
    can run the remaining steps.

    NOTE(review): presumably this should run HTML Tidy, which would need
    TidyLib plus its Python wrapper installed -- confirm before implementing.
    """
    # TODO: implement this step.
def remove_brackets():
    """Placeholder for the remove-brackets step; not implemented yet.

    Deliberately a no-op so that clean(), which calls it last, can finish.
    """
    # TODO: implement this step.
def clean(file):
    """Parse ``file`` and run every cleaning step on it, in order.

    The helper functions all operate on the module-level ``soup``, so the
    file is parsed into that global before the steps run. Previously the
    ``file`` argument was never read and ``soup`` was never created.

    file -- path of the HTML file to clean.

    Returns the cleaned ``soup``. Nothing is written back to disk; the
    caller decides what to do with the result.
    """
    global soup
    with open(file) as source:
        soup = BeautifulSoup(source.read())

    # Order matters: tag replacements come before the span/div removal, and
    # the class attribute is dropped only after the class-based matching.
    remove_comments()
    remove_tags(['meta', 'link'])
    clean_bold()
    clean_italics()
    wrap_tables()
    add_label_width()
    remove_spans()
    remove_divs()
    remove_h3()
    remove_class()
    add_font()
    tidy()
    remove_brackets()
    return soup
# Clean every HTML file in the current directory. Subdirectories and
# non-HTML entries (including this script) are skipped, since the cleaning
# steps only make sense for HTML documents.
for filename in os.listdir('.'):
    is_html = filename.lower().endswith(('.html', '.htm'))
    if is_html and os.path.isfile(filename):
        clean(filename)