Re: Searching a string and extracting all occurrences of a substring




Try Beautiful Soup, or if your input is simple enough, the re module.

Hi Gabriel,

I first tried "HTMLParser" and wrote this short script:

from HTMLParser import HTMLParser
from htmlentitydefs import entitydefs

class MyDocParser(HTMLParser):
    """Collect the text found inside <parameter> elements.

    Text is appended to ``self.paths`` while the parser is between a
    ``parameter`` start tag and the matching end tag.  Every chunk that
    reaches ``handle_data`` -- including each resolved entity or character
    reference -- becomes its own list entry.
    """

    def __init__(self):
        self.paths = []
        self.readingpaths = 0  # flag: non-zero while inside <parameter>
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Start collecting when a <parameter> element opens."""
        if tag == 'parameter':
            self.readingpaths = 1

    def handle_endtag(self, tag):
        """Stop collecting when a <parameter> element closes."""
        if tag == 'parameter':
            self.readingpaths = 0

    def handle_data(self, data):
        """Record *data* if we are currently inside a <parameter> element."""
        if self.readingpaths:
            self.paths.append(data)

    def handle_entityref(self, name):
        """Handle named references, e.g. the ``&amp;`` in 'Home &amp; Products'.

        Known entities are replaced by their ``entitydefs`` value; unknown
        ones are passed through as the literal ``&name;`` text.
        """
        self.handle_data(entitydefs.get(name, '&' + name + ';'))

    def handle_charref(self, name):
        """Handle numeric references, e.g. the ``&#174;`` in 'Products&#174;'.

        *name* is either decimal digits or ``x``/``X`` followed by hex
        digits.  References that do not parse, or whose value falls outside
        1..255, are ignored.
        """
        try:
            if name[:1].lower() == 'x':
                charnum = int(name[1:], 16)
            else:
                charnum = int(name)
        except ValueError:
            return

        if charnum < 1 or charnum > 255:
            return

        # Emit the decoded character; without this the reference would be
        # dropped from the collected text.
        self.handle_data(chr(charnum))

    def get_paths(self):
        """Return the list of text chunks collected so far."""
        return self.paths


def parse_content(content):
    """Run *content* through a fresh MyDocParser and return its paths."""
    doc_parser = MyDocParser()
    doc_parser.feed(content)
    return doc_parser.get_paths()

# /end

This works as long as there are no other <parameter> tags in the content that I parse.


Nico
.



Relevant Pages