#!/usr/bin/python
"""markdown.py: A Markdown-styled-text to HTML converter in Python.

Usage:
  ./markdown.py textfile.markdown

Calling:
  import markdown
  somehtml = markdown.markdown(sometext)
"""

import hashlib
import re
import string
import sys

__version__ = '1.0.1' # port of 1.0.1
__license__ = "GNU GPL 2"
# NOTE(review): each entry ends in a blank, which suggests a bracketed
# contact address was lost from the copy under review -- restore them from
# a pristine copy if one is available.
__author__ = [
    'John Gruber ',
    'Tollef Fog Heen ',
    'Aaron Swartz '
]


def _md5(text):
    """Return an md5 hash object for *text*.

    Text that is not already a byte string is encoded as UTF-8 first,
    because hashlib only accepts bytes.
    """
    if not isinstance(text, bytes):
        text = text.encode('utf-8')
    return hashlib.md5(text)


def semirandom(seed):
    """Map *seed* to a repeatable float between 0 and 1.

    The value is the sum of the 16 md5 digest bytes divided by the largest
    possible sum (255 * 16), so equal seeds always give equal results.
    """
    # bytearray yields integers on both Python 2 and Python 3.
    return sum(bytearray(_md5(seed).digest())) / (255*16.)


class _Markdown(object):
    """Converter state and passes for one Markdown-to-HTML translation.

    Call parse(); the underscore-prefixed methods are the individual passes
    it chains together. Compiled patterns live next to the method that
    uses them.
    """

    emptyelt = " />"   # suffix closing empty elements such as <hr and <br
    tabwidth = 4

    # Characters that can be backslash-escaped, each mapped to an md5 hex
    # digest that stands in for the literal character until the final
    # _UnescapeSpecialChars pass swaps it back.
    escapechars = '\\`*_{}[]()>#+-.!'
    escapetable = dict((c, _md5(c).hexdigest()) for c in escapechars)

    r_multiline = re.compile("\n{2,}")
    r_stripspace = re.compile(r"^[ \t]+$", re.MULTILINE)

    def parse(self, text):
        """Convert the Markdown source *text* and return the HTML string."""
        # Per-document state, reset on every call.
        self.urls = {}          # link id -> url
        self.titles = {}        # link id -> title
        self.html_blocks = {}   # md5 key -> protected block of HTML
        self.list_level = 0     # nesting depth while processing lists

        # Normalise line endings and guarantee a trailing blank line.
        text = text.replace("\r\n", "\n")
        text = text.replace("\r", "\n")
        text += "\n\n"
        text = self._Detab(text)
        # Empty out lines holding only spaces/tabs so that a run of
        # newlines reliably marks a blank line.
        text = self.r_stripspace.sub("", text)
        text = self._HashHTMLBlocks(text)
        text = self._StripLinkDefinitions(text)
        text = self._RunBlockGamut(text)
        text = self._UnescapeSpecialChars(text)
        return text

    # NOTE(review): the url sub-pattern was lost from the copy under review;
    # "<?(\S+?)>?" follows the Markdown 1.0.1 reference implementation.
    r_StripLinkDefinitions = re.compile(r"""
    ^[ ]{0,%d}\[(.+)\]:  # id = $1
      [ \t]*\n?[ \t]*
    <?(\S+?)>?           # url = $2
      [ \t]*\n?[ \t]*
    (?:
      (?<=\s)            # lookbehind for whitespace
      [\"\(]             # " is backlashed so it colorizes our code right
      (.+?)              # title = $3
      [\"\)]
      [ \t]*
    )?                   # title is optional
    (?:\n+|\Z)
    """ % (tabwidth-1), re.MULTILINE|re.VERBOSE)

    def _StripLinkDefinitions(self, text):
        """Remove "[id]: url "title"" lines, recording them in urls/titles."""
        def replacefunc(matchobj):
            (t1, t2, t3) = matchobj.groups()
            #@@ case sensitivity?
            self.urls[t1.lower()] = self._EncodeAmpsAndAngles(t2)
            if t3 is not None:
                self.titles[t1.lower()] = t3.replace('"', '&quot;')
            return ""

        text = self.r_StripLinkDefinitions.sub(replacefunc, text)
        return text

    blocktagsb = r"p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|math"
    blocktagsa = blocktagsb + "|ins|del"

    # NOTE(review): the end-tag sub-patterns of the next two expressions were
    # lost from the copy under review and follow the Markdown 1.0.1 reference.
    r_HashHTMLBlocks1 = re.compile(r"""
    (            # save in $1
    ^            # start of line  (with /m)
    <(%s)        # start tag = $2
    \b           # word break
    (.*\n)*?     # any number of lines, minimally matching
    </\2>        # the matching end tag
    [ \t]*       # trailing spaces/tabs
    (?=\n+|$)    # followed by a newline or end of document
    )
    """ % blocktagsa, re.MULTILINE | re.VERBOSE)

    r_HashHTMLBlocks2 = re.compile(r"""
    (            # save in $1
    ^            # start of line  (with /m)
    <(%s)        # start tag = $2
    \b           # word break
    (.*\n)*?     # any number of lines, minimally matching
    .*</\2>      # the matching end tag
    [ \t]*       # trailing spaces/tabs
    (?=\n+|\Z)   # followed by a newline or end of document
    )
    """ % blocktagsb, re.MULTILINE | re.VERBOSE)

    r_HashHR = re.compile(r"""
    (?:
    (?<=\n\n)    # Starting after a blank line
    |            # or
    \A\n?        # the beginning of the doc
    )
    (            # save in $1
    [ ]{0,%d}
    <(hr)        # start tag = $2
    \b           # word break
    ([^<>])*?    #
    /?>          # the matching end tag
    [ \t]*
    (?=\n{2,}|\Z)# followed by a blank line or end of document
    )
    """ % (tabwidth-1), re.VERBOSE)

    # NOTE(review): the comment sub-pattern was lost from the copy under
    # review and follows the Markdown 1.0.1 reference. Without DOTALL it only
    # recognises comments that fit on a single line -- confirm the intent.
    r_HashComment = re.compile(r"""
    (?:
    (?<=\n\n)    # Starting after a blank line
    |            # or
    \A\n?        # the beginning of the doc
    )
    (            # save in $1
    [ ]{0,%d}
    (?:
      <!
      (--.*?--\s*)+
      >
    )
    [ \t]*
    (?=\n{2,}|\Z)# followed by a blank line or end of document
    )
    """ % (tabwidth-1), re.VERBOSE)

    def _HashHTMLBlocks(self, text):
        """Swap block-level HTML for md5 keys so later passes leave it alone.

        Every protected block is stored in self.html_blocks under its key;
        _FormParagraphs puts the original text back.
        """
        def handler(m):
            key = _md5(m.group(1)).hexdigest()
            self.html_blocks[key] = m.group(1)
            return "\n\n%s\n\n" % key

        text = self.r_HashHTMLBlocks1.sub(handler, text)
        text = self.r_HashHTMLBlocks2.sub(handler, text)
        text = self.r_HashHR.sub(handler, text)
        text = self.r_HashComment.sub(handler, text)
        return text

    #@@@ wrong!
    r_hr1 = re.compile(r'^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$', re.M)
    r_hr2 = re.compile(r'^[ ]{0,2}([ ]?-[ ]?){3,}[ \t]*$', re.M)
    r_hr3 = re.compile(r'^[ ]{0,2}([ ]?_[ ]?){3,}[ \t]*$', re.M)

    def _RunBlockGamut(self, text):
        """Apply the block-level passes (headers, rules, lists, code, quotes,
        paragraphs) to *text* and return the result."""
        text = self._DoHeaders(text)
        for x in [self.r_hr1, self.r_hr2, self.r_hr3]:
            text = x.sub("\n<hr%s\n" % self.emptyelt, text)
        # NOTE(review): the three calls below were lost from the copy under
        # review; their order follows the Markdown 1.0.1 reference.
        text = self._DoLists(text)
        text = self._DoCodeBlocks(text)
        text = self._DoBlockQuotes(text)

        # parse() already hashed the HTML blocks found in the source; hash
        # again so the markup generated above is not wrapped in <p>s.
        text = self._HashHTMLBlocks(text)
        text = self._FormParagraphs(text)
        return text

    r_NewLine = re.compile(" {2,}\n")

    def _RunSpanGamut(self, text):
        """Apply the span-level passes (code, escapes, images, links,
        entities, emphasis, hard breaks) to *text* and return the result."""
        text = self._DoCodeSpans(text)
        text = self._EscapeSpecialChars(text)
        text = self._DoImages(text)
        text = self._DoAnchors(text)
        text = self._DoAutoLinks(text)
        text = self._EncodeAmpsAndAngles(text)
        text = self._DoItalicsAndBold(text)
        # Two or more spaces before a newline make a hard line break.
        text = self.r_NewLine.sub(" <br%s\n" % self.emptyelt, text)
        return text

    def _EscapeSpecialChars(self, text):
        """Protect '*' and '_' inside HTML tags and encode backslash escapes
        in the text between tags.

        NOTE(review): this method's body was lost from the copy under review
        (only the call in _RunSpanGamut survived); it is rebuilt from the
        Markdown 1.0.1 reference on top of _TokenizeHTML and
        _EncodeBackslashEscapes.
        """
        pieces = []
        for kind, chunk in self._TokenizeHTML(text):
            if kind == "tag":
                # Keep emphasis markers inside tags away from the
                # italics/bold pass.
                chunk = chunk.replace("*", self.escapetable["*"])
                chunk = chunk.replace("_", self.escapetable["_"])
            else:
                chunk = self._EncodeBackslashEscapes(chunk)
            pieces.append(chunk)
        return "".join(pieces)

    # NOTE(review): r_DoAnchors1 and the opening of r_DoAnchors2 were lost
    # from the copy under review. They follow the Markdown 1.0.1 reference,
    # with a plain non-greedy link text because nested brackets are not
    # matched (see the comment in _DoAnchors).
    r_DoAnchors1 = re.compile(r"""
    (                    # wrap whole match in $1
      \[
        (.*?)            # link text = $2
      \]

      [ ]?               # one optional space
      (?:\n[ ]*)?        # one optional newline followed by spaces

      \[
        (.*?)            # id = $3
      \]
    )
    """, re.S|re.VERBOSE)

    r_DoAnchors2 = re.compile(r"""
    (                    # wrap whole match in $1
      \[
        (.*?)            # link text = $2
      \]
      \(                 # literal paren
        [ \t]*
        <?(.*?)>?        # href = $3
        [ \t]*
        (                # $4
          ([\'\"])       # quote char = $5
          (.*?)          # Title = $6
          \5             # matching quote
        )?               # title is optional
      \)
    )
    """, re.S|re.VERBOSE)

    def _DoAnchors(self, text):
        """Turn reference-style and inline links into <a> elements."""
        # We here don't do the same as the perl version, as python's regex
        # engine gives us no way to match brackets.

        def handler1(m):
            # Reference style: [link text] [id]
            whole_match = m.group(1)
            link_text = m.group(2)
            link_id = m.group(3).lower()
            if not link_id:
                # "[text][]" uses the link text itself as the id.
                link_id = link_text.lower()
            title = self.titles.get(link_id, None)

            if link_id in self.urls:
                url = self.urls[link_id]
                # Hide emphasis markers from the italics/bold pass.
                url = url.replace("*", self.escapetable["*"])
                url = url.replace("_", self.escapetable["_"])
                res = '<a href="%s"' % url
                if title:
                    title = title.replace("*", self.escapetable["*"])
                    title = title.replace("_", self.escapetable["_"])
                    res += ' title="%s"' % title
                res += ">%s</a>" % link_text
            else:
                # Unknown id: leave the source text untouched.
                res = whole_match
            return res

        def handler2(m):
            # Inline style: [link text](url "optional title")
            link_text = m.group(2)
            url = m.group(3)
            title = m.group(6)

            url = url.replace("*", self.escapetable["*"])
            url = url.replace("_", self.escapetable["_"])
            res = '<a href="%s"' % url
            if title:
                title = title.replace('"', '&quot;')
                title = title.replace("*", self.escapetable["*"])
                title = title.replace("_", self.escapetable["_"])
                res += ' title="%s"' % title
            res += ">%s</a>" % link_text
            return res

        text = self.r_DoAnchors1.sub(handler1, text)
        text = self.r_DoAnchors2.sub(handler2, text)
        return text

    # NOTE(review): r_DoImages1 and the opening of r_DoImages2 were lost from
    # the copy under review and follow the Markdown 1.0.1 reference.
    r_DoImages1 = re.compile(r"""
    (                    # wrap whole match in $1
      !\[
        (.*?)            # alt text = $2
      \]

      [ ]?               # one optional space
      (?:\n[ ]*)?        # one optional newline followed by spaces

      \[
        (.*?)            # id = $3
      \]
    )
    """, re.VERBOSE|re.S)

    r_DoImages2 = re.compile(r"""
    (                    # wrap whole match in $1
      !\[
        (.*?)            # alt text = $2
      \]
      \(                 # literal paren
        [ \t]*
        <?(\S+?)>?       # src url = $3
        [ \t]*
        (                # $4
          ([\'\"])       # quote char = $5
          (.*?)          # title = $6
          \5             # matching quote
          [ \t]*
        )?               # title is optional
      \)
    )
    """, re.VERBOSE|re.S)

    def _DoImages(self, text):
        """Turn reference-style and inline images into <img> elements."""
        def handler1(m):
            # Reference style: ![alt text][id]
            whole_match = m.group(1)
            alt_text = m.group(2)
            link_id = m.group(3).lower()
            if not link_id:
                # "![alt][]" uses the alt text itself as the id.
                link_id = alt_text.lower()
            alt_text = alt_text.replace('"', "&quot;")

            if link_id in self.urls:
                url = self.urls[link_id]
                url = url.replace("*", self.escapetable["*"])
                url = url.replace("_", self.escapetable["_"])
                res = '<img src="%s" alt="%s"' % (url, alt_text)
                if link_id in self.titles:
                    title = self.titles[link_id]
                    title = title.replace("*", self.escapetable["*"])
                    title = title.replace("_", self.escapetable["_"])
                    res += ' title="%s"' % title
                res += self.emptyelt
            else:
                # Unknown id: leave the source text untouched.
                res = whole_match
            return res

        def handler2(m):
            # Inline style: ![alt text](url "optional title")
            # NOTE(review): rebuilt from the Markdown 1.0.1 reference, which
            # always emits a title attribute (empty when none was given).
            alt_text = m.group(2).replace('"', "&quot;")
            url = m.group(3)
            title = (m.group(6) or '').replace('"', "&quot;")

            url = url.replace("*", self.escapetable["*"])
            url = url.replace("_", self.escapetable["_"])
            title = title.replace("*", self.escapetable["*"])
            title = title.replace("_", self.escapetable["_"])
            res = '<img src="%s" alt="%s" title="%s"' % (url, alt_text, title)
            res += self.emptyelt
            return res

        text = self.r_DoImages1.sub(handler1, text)
        text = self.r_DoImages2.sub(handler2, text)
        return text

    # NOTE(review): this pattern was lost from the copy under review and
    # follows the Markdown 1.0.1 reference for "# atx-style" headers.
    r_DoHeaders = re.compile(r"""
    ^(\#{1,6})   # $1 = string of #'s
    [ \t]*
    (.+?)        # $2 = Header text
    [ \t]*
    \#*          # optional closing #'s (not counted)
    \n+
    """, re.VERBOSE|re.M)

    def _DoHeaders(self, text):
        """Convert underlined (= / -) and #-prefixed headers to <h1>..<h6>."""
        def findheader(text, c, n):
            # A line made up solely of the character c, preceded by a
            # non-blank line and followed by a blank one, turns the
            # preceding line into an <h{n}> header.
            textl = text.split('\n')
            for i in range(len(textl)):
                # textl shrinks whenever a header is folded, and both
                # neighbours must exist: without the lower bound, i - 1
                # would wrap around to the last line.
                if i < 1 or i + 1 >= len(textl):
                    continue
                underline = textl[i].strip()
                count = underline.count(c)
                if (count > 0 and count == len(underline)
                        and textl[i+1].strip() == ''
                        and textl[i-1].strip() != ''):
                    # Drop the underline, wrap the title, then drop the
                    # blank line that followed the underline.
                    textl = textl[:i] + textl[i+1:]
                    textl[i-1] = ('<h' + n + '>'
                                  + self._RunSpanGamut(textl[i-1])
                                  + '</h' + n + '>')
                    textl = textl[:i] + textl[i+1:]
            text = '\n'.join(textl)
            return text

        def handler(m):
            level = len(m.group(1))
            header = self._RunSpanGamut(m.group(2))
            return "<h%s>%s</h%s>\n\n" % (level, header, level)

        text = findheader(text, '=', '1')
        text = findheader(text, '-', '2')
        text = self.r_DoHeaders.sub(handler, text)
        return text

    rt_l = r"""
    (
      (
        [ ]{0,%d}
        ([*+-]|\d+[.])
        [ \t]+
      )
      (?:.+?)
      (
        \Z
      |
        \n{2,}
        (?=\S)
        (?![ \t]* ([*+-]|\d+[.])[ \t]+)
      )
    )
    """ % (tabwidth - 1)
    r_DoLists = re.compile('^'+rt_l, re.M | re.VERBOSE | re.S)
    r_DoListsTop = re.compile(
        r'(?:\A\n?|(?<=\n\n))'+rt_l, re.M | re.VERBOSE | re.S)

    def _DoLists(self, text):
        """Convert bulleted and numbered lists to <ul> / <ol> elements."""
        def handler(m):
            list_type = "ol"
            if m.group(3) in ["*", "-", "+"]:
                list_type = "ul"
            listn = m.group(1)
            # Widen every blank-line gap to exactly two blank lines before
            # the items are split.
            listn = self.r_multiline.sub("\n\n\n", listn)
            res = self._ProcessListItems(listn)
            res = "<%s>\n%s</%s>\n" % (list_type, res, list_type)
            return res

        # Inside a list any line start may open a sub-list; at the top level
        # a list has to begin the text or follow a blank line.
        if self.list_level:
            text = self.r_DoLists.sub(handler, text)
        else:
            text = self.r_DoListsTop.sub(handler, text)
        return text

    r_multiend = re.compile(r"\n{2,}\Z")
    r_ProcessListItems = re.compile(r"""
    (\n)?                            # leading line = $1
    (^[ \t]*)                        # leading whitespace = $2
    ([*+-]|\d+[.]) [ \t]+            # list marker = $3
    ((?:.+?)                         # list item text = $4
    (\n{1,2}))
    (?= \n* (\Z | \2 ([*+-]|\d+[.]) [ \t]+))
    """, re.VERBOSE | re.M | re.S)

    def _ProcessListItems(self, text):
        """Convert the items of one list into <li> elements."""
        self.list_level += 1
        text = self.r_multiend.sub("\n", text)

        def handler(m):
            item = m.group(4)
            leading_line = m.group(1)

            if leading_line or self.r_multiline.search(item):
                # Preceded by, or containing, a blank line: treat the item
                # as block content.
                item = self._RunBlockGamut(self._Outdent(item))
            else:
                item = self._DoLists(self._Outdent(item))
                if item.endswith("\n"):
                    item = item[:-1] # chomp
                item = self._RunSpanGamut(item)
            return "<li>%s</li>\n" % item

        text = self.r_ProcessListItems.sub(handler, text)
        self.list_level -= 1
        return text

    r_DoCodeBlocks = re.compile(r"""
    (?:\n\n|\A)
    (                 # $1 = the code block
    (?:
    (?:[ ]{%d} | \t)  # Lines must start with a tab or equiv
    .*\n+
    )+
    )
    ((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space/end of doc
    """ % (tabwidth, tabwidth), re.M | re.VERBOSE)

    def _DoCodeBlocks(self, text):
        """Wrap indented blocks in <pre><code>, encoding their content."""
        def handler(m):
            codeblock = m.group(1)
            codeblock = self._EncodeCode(self._Outdent(codeblock))
            codeblock = self._Detab(codeblock)
            codeblock = codeblock.lstrip("\n")
            codeblock = codeblock.rstrip()
            res = "\n\n<pre><code>%s\n</code></pre>\n\n" % codeblock
            return res

        text = self.r_DoCodeBlocks.sub(handler, text)
        return text

    # NOTE(review): the closing part of this pattern was lost from the copy
    # under review and follows the Markdown 1.0.1 reference.
    r_DoCodeSpans = re.compile(r"""
    (`+)            # $1 = Opening run of `
    (.+?)           # $2 = The code block
    (?<!`)
    \1              # Matching closer
    (?!`)
    """, re.VERBOSE)

    def _DoCodeSpans(self, text):
        """Wrap backtick-delimited spans in <code>, encoding their content."""
        def handler(m):
            c = m.group(2)
            c = c.strip()
            c = self._EncodeCode(c)
            return "<code>%s</code>" % c

        text = self.r_DoCodeSpans.sub(handler, text)
        return text

    def _EncodeCode(self, text):
        """Make *text* safe to show literally inside <code>.

        HTML metacharacters become entities; Markdown metacharacters become
        their escapetable placeholders so later passes ignore them.
        """
        text = text.replace("&", "&amp;")   # must come first
        text = text.replace("<", "&lt;")
        text = text.replace(">", "&gt;")
        for c in "*_{}[]\\":
            text = text.replace(c, self.escapetable[c])
        return text

    r_DoBold = re.compile(
        r"(\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1", re.VERBOSE | re.S)
    r_DoItalics = re.compile(
        r"(\*|_) (?=\S) (.+?) (?<=\S) \1", re.VERBOSE | re.S)

    def _DoItalicsAndBold(self, text):
        """Convert **bold** / __bold__ and *italic* / _italic_ spans."""
        # Bold goes first so its doubled markers are not taken for italics.
        text = self.r_DoBold.sub(r"<strong>\2</strong>", text)
        text = self.r_DoItalics.sub(r"<em>\2</em>", text)
        return text

    r_start = re.compile(r"^", re.M)
    r_DoBlockQuotes1 = re.compile(r"^[ \t]*>[ \t]?", re.M)
    r_DoBlockQuotes2 = re.compile(r"^[ \t]+$", re.M)
    r_DoBlockQuotes3 = re.compile(r"""
    (                       # Wrap whole match in $1
     (
       ^[ \t]*>[ \t]?       # '>' at the start of a line
       .+\n                 # rest of the first line
       (.+\n)*              # subsequent consecutive lines
       \n*                  # blanks
      )+
    )""", re.M | re.VERBOSE)
    r_protectpre = re.compile(r'(\s*<pre>.+?</pre>)', re.S)
    # NOTE(review): runs of blanks were collapsed in the copy under review;
    # the two-space indent here and in _DoBlockQuotes follows the
    # Markdown 1.0.1 reference.
    r_propre = re.compile(r'^  ', re.M)

    def _DoBlockQuotes(self, text):
        """Convert runs of '>'-prefixed lines into <blockquote> elements."""
        def prehandler(m):
            # Indenting would change the content of a <pre> block, so take
            # the indent back out of those.
            return self.r_propre.sub('', m.group(1))

        def handler(m):
            bq = m.group(1)
            bq = self.r_DoBlockQuotes1.sub("", bq)   # trim one quote level
            bq = self.r_DoBlockQuotes2.sub("", bq)   # trim blank-only lines
            bq = self._RunBlockGamut(bq)             # recurse
            bq = self.r_start.sub("  ", bq)          # indent the result
            bq = self.r_protectpre.sub(prehandler, bq)
            return "<blockquote>\n%s\n</blockquote>\n\n" % bq

        text = self.r_DoBlockQuotes3.sub(handler, text)
        return text

    r_tabbed = re.compile(r"^([ \t]*)")

    def _FormParagraphs(self, text):
        """Wrap ordinary paragraphs in <p> and restore hashed HTML blocks."""
        text = text.strip("\n")
        grafs = self.r_multiline.split(text)

        for g in range(len(grafs)):
            t = grafs[g].strip() #@@?
            if t not in self.html_blocks:
                t = self._RunSpanGamut(t)
                t = self.r_tabbed.sub(r"<p>", t)
                t += "</p>"
                grafs[g] = t

        # Second pass: put the protected HTML blocks back.
        for g in range(len(grafs)):
            t = grafs[g].strip()
            if t in self.html_blocks:
                grafs[g] = self.html_blocks[t]

        return "\n\n".join(grafs)

    r_EncodeAmps = re.compile(r"&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)")
    r_EncodeAngles = re.compile(r"<(?![a-z/?\$!])")

    def _EncodeAmpsAndAngles(self, text):
        """Encode '&' that does not start an entity and '<' that does not
        start a tag."""
        text = self.r_EncodeAmps.sub("&amp;", text)
        text = self.r_EncodeAngles.sub("&lt;", text)
        return text

    def _EncodeBackslashEscapes(self, text):
        """Replace each backslash-escaped character with its placeholder."""
        for char in self.escapechars:
            text = text.replace("\\" + char, self.escapetable[char])
        return text

    r_link = re.compile(r"<((https?|ftp):[^\'\">\s]+)>", re.I)
    r_email = re.compile(r"""
      <
      (?:mailto:)?
      (
         [-.\w]+
         \@
         [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
      )
      >""", re.VERBOSE|re.I)

    def _DoAutoLinks(self, text):
        """Turn bracketed URLs and e-mail addresses into links."""
        text = self.r_link.sub(r'<a href="\1">\1</a>', text)

        def handler(m):
            l = m.group(1)
            return self._EncodeEmailAddress(self._UnescapeSpecialChars(l))

        text = self.r_email.sub(handler, text)
        return text

    r_EncodeEmailAddress = re.compile(r">.+?:")

    def _EncodeEmailAddress(self, text):
        """Return a mailto: link for *text* with its characters written as a
        mix of decimal entities, hex entities and literal characters.

        The mix is chosen by semirandom() on the output built so far, so a
        given address always encodes the same way.
        """
        encode = [
            lambda x: "&#%s;" % ord(x),
            lambda x: "&#x%X;" % ord(x),
            lambda x: x
        ]

        text = "mailto:" + text
        addr = ""
        for c in text:
            if c == ':':
                # The colon is always left as it is.
                addr += c
                continue

            r = semirandom(addr)
            if r < 0.45:
                addr += encode[1](c)
            elif r > 0.9 and c != '@':
                # '@' is never left as a literal character.
                addr += encode[2](c)
            else:
                addr += encode[0](c)

        text = '<a href="%s">%s</a>' % (addr, addr)
        # Cut the encoded "mailto:" prefix out of the visible link text.
        text = self.r_EncodeEmailAddress.sub('>', text)
        return text

    def _UnescapeSpecialChars(self, text):
        """Swap every escapetable placeholder back to its character."""
        for key in self.escapetable:
            text = text.replace(self.escapetable[key], key)
        return text

    tokenize_depth = 6
    tokenize_nested_tags = '|'.join([r'(?:<[a-z/!$](?:[^<>]'] * tokenize_depth) + (')*>)' * tokenize_depth)
    # NOTE(review): the comment alternative was lost from the copy under
    # review and follows the Markdown 1.0.1 reference.
    r_TokenizeHTML = re.compile(
        r"""(?: <! ( -- .*? -- \s* )+ > ) |  # comment
            (?: <\? .*? \?> ) |              # processing instruction
            %s                               # nested tags
        """ % tokenize_nested_tags, re.I|re.VERBOSE)

    def _TokenizeHTML(self, text):
        """Split *text* into a list of ["tag", s] and ["text", s] pairs."""
        pos = 0
        tokens = []
        matchobj = self.r_TokenizeHTML.search(text, pos)
        while matchobj:
            whole_tag = matchobj.string[matchobj.start():matchobj.end()]
            sec_start = matchobj.end()
            tag_start = sec_start - len(whole_tag)
            if pos < tag_start:
                tokens.append(["text", matchobj.string[pos:tag_start]])

            tokens.append(["tag", whole_tag])
            pos = sec_start
            matchobj = self.r_TokenizeHTML.search(text, pos)

        if pos < len(text):
            tokens.append(["text", text[pos:]])
        return tokens

    r_Outdent = re.compile(r"""^(\t|[ ]{1,%d})""" % tabwidth, re.M)

    def _Outdent(self, text):
        """Remove one level of indentation from the start of every line."""
        text = self.r_Outdent.sub("", text)
        return text

    def _Detab(self, text):
        """Expand tabs to spaces using tabwidth-column tab stops."""
        return text.expandtabs(self.tabwidth)


def Markdown(*args, **kw):
    """Convert Markdown text to HTML with a fresh _Markdown converter.

    Arguments are handed to _Markdown.parse() unchanged.
    """
    return _Markdown().parse(*args, **kw)

markdown = Markdown

if __name__ == '__main__':
    if len(sys.argv) > 1:
        with open(sys.argv[1]) as source:
            print(Markdown(source.read()))
    else:
        print(Markdown(sys.stdin.read()))