6 files changed, 12 insertions, 16 deletions
diff --git a/BUGS b/BUGS
index 1426aab..7358117 100644
--- a/BUGS
+++ b/BUGS
@@ -1,9 +1,3 @@
-There is a bug causing some words to not be included at all. Cowboy and chicken are examples.
-
-Word type (noun, adj, verb, etc.) markings after headword are included in the index, so searching for e.g. "A" or "n" is impossible as it returns all adjectives or nouns.
-
 Many of the IPA and other non-ascii symbols aren't yet converted to unicode, and are just discarded. Many (but not all) of these symbols are documented in files in Help/ from the original CDs.
 
 Search results are not necessarily returned in order. This is probably unavoidable with the dict protocol.
-
-grep TODO symbols *sed *awk Makefile
diff --git a/Makefile b/Makefile
index 10c4fb3..cfb4484 100644
--- a/Makefile
+++ b/Makefile
@@ -11,9 +11,10 @@ all: oed.jargon oed.dict.dz
 
 .htm.jargon:
 	iconv -f ISO-8859-1 -t UTF-8 < "$<" \
+	| sed 's/\r//g' \
 	| awk -f htmtojargon.awk \
-	| sed -f separateheadmeta.sed \
 	| sed -f symbols.sed \
+	| perl -p removeetags.pl \
 	| sed -f xmlcleanup.sed \
 	| sed -f xmlcleanup2.sed \
 	> "$@"
@@ -28,7 +29,7 @@ oed.dict: oed.jargon
 	cat $< \
 	| dictfmt -j --utf8 \
 	  --columns 0 --headword-separator ',' \
-	  --index-data-separator ';' \
+	  --index-data-separator "	" \
 	  -u http://njw.me.uk/oed \
 	  -s "Oxford English Dictionary, 2nd Edition" \
 	  oed
diff --git a/htmtojargon.awk b/htmtojargon.awk
index 16c5356..323b7ee 100644
--- a/htmtojargon.awk
+++ b/htmtojargon.awk
@@ -1,5 +1,6 @@
 #!/usr/bin/awk -f
 # dirty xml reading is more fun
+# requires nawk / gawk for sub()
 
 BEGIN {
 	FS = "</hg>"
@@ -8,6 +9,10 @@ BEGIN {
 { printdefs($1, $2); }
 
 function printdefs(word, defs) {
+	# split the headwords and other information about the word,
+	# to be used in conjunction with dictfmt's --index-data-separator
+	sub("<i>", "	", word);
+
 	n = split(defs, array, "<def>");
 	for(i=0; i<=n; i++) {
 		if(array[i] != "") {
diff --git a/removeetags.pl b/removeetags.pl
new file mode 100644
index 0000000..8de7875
--- /dev/null
+++ b/removeetags.pl
@@ -0,0 +1,4 @@
+# <e> tags seem to duplicate other preceding tags, so remove them
+#
+# sed doesn't support non-greedy matching, so we're using perl
+s/<e>.*?<\/e>//g
diff --git a/separateheadmeta.sed b/separateheadmeta.sed
deleted file mode 100644
index 9dca6c6..0000000
--- a/separateheadmeta.sed
+++ /dev/null
@@ -1,2 +0,0 @@
-# this replaces the first <i> in a headword with "; "
-/^:[^:]*:/ s/<i>/; /
diff --git a/xmlcleanup2.sed b/xmlcleanup2.sed
index 5352a92..56e66b0 100644
--- a/xmlcleanup2.sed
+++ b/xmlcleanup2.sed
@@ -1,12 +1,6 @@
 #!/bin/sed -f
 # sed rules that must run after the main set
 
-# <e> tags seem to duplicate <v> tags for some quotes
-# NOTE this assumes e tags only come at end of lines
-s/<e>.*$//g
-# TODO: make this work instead of the above
-#s/<e>.*?<\/e>//g
-
 # any xml tags not processed can just go away
 s/<[^>]*>//g
 # any xml character entities not processed can just go away