← Back to team overview

zim-wiki team mailing list archive

Proof of Concept markdown export for zim

 

Dear all,

Attached is a quick and dirty proof of concept for converting zim
formatting into markdown with pandoc extensions. This script takes a
file name for a zim page as argument and outputs markdown to stdout.
If the people interested in this could go ahead and test it I can
include it in the next version of zim as an export option. The script
requires zim to be installed, or at least to have zim modules in the
path.

To test it I suggest:
1) Create a page in the zim GUI with lots of markup
2) Convert it to markdown using this script and check if it looks as expected
3) Convert to html/pdf/... with pandoc and check again if it looks as expected

Feel free to mail any test results to me personally or put a bug in
the zim bug tracker.


Open issues & questions:

1) How to deal with checkbox lists ? Currently I leave them in the same
syntax as zim (using "[ ]", "[*]" and "[x]") but this will not
translate well in pandoc's interpretation

2) How should we link to other files when exporting a notebook ? I can
link the file as "file:///path/to/file.txt" or as "./file.txt"

3) How to deal with underline / mark formatting ? Now rendering as
bold, which is acceptable for the time being

4) How to deal with indented paragraphs in zim ? Now ignoring any
indenting, because it would turn paragraphs into verbatim in markdown

5) How to deal with image properties, like setting width and height ?

6) How to deal with tags (and in the future anchors) ? For now
I will just put them in as plain text

7) How to deal with inline objects ? (This is a future feature being
tested, allows plugins to do custom rendering for a block of data.)
Probably can re-use syntax for verbatim with code highlighting
properties - e.g. ask to highlight as "x-zim-plugin-foo". This will be
transparent as verbatim text if not supported.


For proper export at least items 1) and 2) need to be addressed. The
others can be ignored because there is at least a fallback behavior that
will not mess up parsing in pandoc, and export is a one-way process.

However for native support in zim (which means storing all pages
directly in markdown) also 3) to 7) need to be addressed to make sure
we can write a page and then read it again getting the exact same
content. (And probably I forgot a few, but this is what I saw so far.)

Regards,

Jaap
# -*- coding: utf-8 -*-

# Copyright 2012 Jaap Karssenberg <jaap.karssenberg@xxxxxxxxx>

'''This module handles dumping zim parse trees as markdown text (with pandoc extensions)'''

# OPEN ISSUES
# - how to deal with checkboxes ?
#   now leaving as is
# - how to deal with underline ?
#   now rendering as bold
# - how to deal with indented paragraphs ?
#   in pandoc indent is verbatim
#   so now all indent is dropped
# - how to deal with image re-size ?
# - how to deal with tags / anchors ?

# TODO
# - check whether zim always produces a blank line before a heading
# - links are not resolved at the moment -- need export linker
# - add \ before line ends to match line breaks from user

import re

from zim.formats import *
from zim.parsing import Re, TextBuffer, url_re


#info = {
#	'name':  'Wiki text',
#	'mime':  'text/x-zim-wiki',
#	'read':	  True,
#	'write':  True,
#	'import': True,
#	'export': True,
#}

TABSTOP = 4

# Regex source matching a list item marker:
# '*' or 0x2022 for normal items
# and '[ ]', '[*]' or '[x]' for checkbox items
bullet_re = u'[\\*\u2022]|\\[[ \\*x]\\]'

# Maps the marker text of a list item to its bullet type constant
bullets = {
	'[ ]': UNCHECKED_BOX,
	'[x]': XCHECKED_BOX,
	'[*]': CHECKED_BOX,
	'*': BULLET,
}

# Reverse dict: bullet type constant -> marker text.
# Built in a single expression so that no loop variable is left behind
# in the module namespace.
bullet_types = dict((kind, marker) for marker, kind in bullets.items())


# Delimiter written both before and after the text of an inline element,
# keyed by the tag name of that element
dumper_tags = dict(
	emphasis='*',
	strong='**',
	mark='__', # OPEN ISSUE: not available in pandoc
	strike='~~',
	code='``',
	sub='~',
	sup='^',
	tag='', # No additional annotation (apart from the visible @)
)


class Dumper(DumperClass):
	'''Dumper that writes a parse tree as markdown text.

	Some formatting is only approximated, see the OPEN ISSUE remarks
	below: the "indent" attribute of paragraphs, lists and verbatim
	blocks is ignored, image properties other than "src" are dropped
	and checkbox bullets are written out with the marker text from
	C{bullet_types}.
	'''

	def dump(self, tree):
		'''Convert a complete parse tree to markdown.

		tree: a ParseTree object
		Returns the lines from the output TextBuffer; a trailing newline
		is requested unless tree.ispartial is set.
		'''
		#~ print 'DUMP WIKI', tree.tostring()
		assert isinstance(tree, ParseTree)
		output = TextBuffer()
		self.dump_children(tree.getroot(), output)
		return output.get_lines(end_with_newline=not tree.ispartial)

	def dump_children(self, list, output, list_level=-1):
		'''Append markdown for the text and the children of a node.

		list: the parent element, its text and all child elements
		      (each followed by its tail text) are written out
		output: the TextBuffer that receives the markdown
		list_level: number of tabs put in front of list items, the
		            default of -1 is used outside of any list

		Raises AssertionError for a link without "href" attribute and
		for a child element with an unknown tag.
		'''
		if list.text:
			output.append(list.text)

		for element in list.getchildren():
			if element.tag in ('p', 'div'):
				# OPEN ISSUE: no indent for para, the "indent" attribute
				# is ignored because indented text is verbatim in markdown
				myoutput = TextBuffer()
				self.dump_children(element, myoutput) # recurs
				output.extend(myoutput)
			elif element.tag == 'h':
				# Clamp the level to the supported range 1..5
				level = min(max(int(element.attrib['level']), 1), 5)
				# Guard against a heading without text, for which
				# element.text is None
				heading = element.text or ''

				if level in (1, 2):
					# setext-style headers for lvl 1 & 2
					if level == 1: char = '='
					else: char = '-'
					output.append(heading + '\n')
					output.append(char * len(heading))
				else:
					# atx-style headers for deeper levels
					output.append('#' * level + ' ' + heading)
			elif element.tag == 'ul':
				# OPEN ISSUE: the "indent" attribute of the list itself
				# is ignored, only the nesting level is used
				myoutput = TextBuffer()
				self.dump_children(element, myoutput, list_level=list_level+1) # recurs
				output.extend(myoutput)
			elif element.tag == 'li':
				# An explicit indent on the item overrules the nesting
				# level and stays in effect for the items that follow
				if 'indent' in element.attrib:
					list_level = int(element.attrib['indent'])
				if 'bullet' in element.attrib:
					bullet = bullet_types[element.attrib['bullet']]
				else:
					bullet = '*'
				output.append('\t'*list_level+bullet+' ')
				self.dump_children(element, output, list_level=list_level) # recurs
				output.append('\n')
			elif element.tag == 'pre':
				# OPEN ISSUE: the "indent" attribute is ignored here too
				myoutput = TextBuffer()
				# Guard against an empty block, for which element.text
				# is None
				myoutput.append(element.text or '')
				myoutput.prefix_lines('\t') # verbatim is always indented
				output.extend(myoutput)
			elif element.tag == 'link':
				assert 'href' in element.attrib, \
					'BUG: link %s "%s"' % (element.attrib, element.text)
				href = element.attrib['href']
				text = element.text or href
				if href == text and url_re.match(href):
					# Bare URL, write as automatic link
					output.append('<' + href + '>')
				else:
					output.append('[%s](%s)' % (text, href))
			elif element.tag == 'img':
				# OPEN ISSUE: image properties used in zim (all attributes
				# other than "src") are not supported in pandoc and are
				# therefore not written out
				src = element.attrib['src']
				text = element.text or ''
				output.append('![%s](%s)' % (text, src))
			elif element.tag in dumper_tags:
				# Inline formatting, elements without text are skipped
				if element.text:
					tag = dumper_tags[element.tag]
					output.append(tag + element.text + tag)
			else:
				# Raised explicitly instead of "assert False" so the check
				# is not stripped when running with "python -O"
				raise AssertionError('Unknown node type: %s' % element)

			if element.tail:
				output.append(element.tail)


def convert_file(file):
	'''Parse a zim wiki page from disk and dump it as markdown.

	file: the path of the page, it is handed to zim.fs.File
	Returns whatever Dumper.dump() returns for the parsed page.
	'''
	from zim.fs import File
	from zim.formats.wiki import Parser

	page = File(file)
	tree = Parser().parse(page.read())
	return Dumper().dump(tree)


if __name__ == '__main__':
	# Command line usage: convert the zim page file given as the only
	# argument and write the markdown to stdout.
	import sys
	import os

	# Explicit check instead of an assert, asserts are stripped when
	# running with "python -O"
	if len(sys.argv) != 2:
		sys.stderr.write('Usage: markdown.py FILE\n')
		sys.exit(1)

	path = sys.argv[1]
	if os.path.isfile(path):
		lines = convert_file(path)
		print(''.join(lines))
	else:
		# This used to be "'Not a file:'. sys.argv[1]", an attribute
		# lookup on the string that raised AttributeError. The message
		# goes to stderr because stdout carries the markdown output.
		sys.stderr.write('Not a file: %s\n' % path)
		sys.exit(1)

	# TODO hook real notebook and exporter with linker
	

Follow ups