summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Nick White <git@njw.me.uk> 2013-05-26 21:52:41 +0100
committer Nick White <git@njw.me.uk> 2013-05-26 21:52:41 +0100
commit 5d0c5974e2655a6f66153bffb67c9346c2c4a589 (patch)
tree 5f0f2d773dda4bb5f7078c0971cc92cf5c46000d
parent b0055fe870a04fbd8eaef669c1ccfb0febfe8bc3 (diff)
download oed2dict-5d0c5974e2655a6f66153bffb67c9346c2c4a589.tar.bz2
oed2dict-5d0c5974e2655a6f66153bffb67c9346c2c4a589.zip
Correct e tag removal, correct headword separation
-rw-r--r-- BUGS 6
-rw-r--r-- Makefile 5
-rw-r--r-- htmtojargon.awk 5
-rw-r--r-- removeetags.pl 4
-rw-r--r-- separateheadmeta.sed 2
-rw-r--r-- xmlcleanup2.sed 6
6 files changed, 12 insertions, 16 deletions
diff --git a/BUGS b/BUGS
index 1426aab..7358117 100644
--- a/BUGS
+++ b/BUGS
@@ -1,9 +1,3 @@
-There is a bug causing some words to not be included at all. Cowboy and chicken are examples.
-
-Word type (noun, adj, verb, etc.) markings after headword are included in the index, so searching for e.g. "A" or "n" is impossible as it returns all adjectives or nouns.
-
Many of the IPA and other non-ascii symbols aren't yet converted to unicode, and are just discarded. Many (but not all) of these symbols are documented in files in Help/ from the original CDs.
Search results are not necessarily returned in order. This is probably unavoidable with the dict protocol.
-
-grep TODO symbols *sed *awk Makefile
diff --git a/Makefile b/Makefile
index 10c4fb3..cfb4484 100644
--- a/Makefile
+++ b/Makefile
@@ -11,9 +11,10 @@ all: oed.jargon oed.dict.dz
.htm.jargon:
iconv -f ISO-8859-1 -t UTF-8 < "$<" \
+ | sed 's/\r//g' \
| awk -f htmtojargon.awk \
- | sed -f separateheadmeta.sed \
| sed -f symbols.sed \
+ | perl -p removeetags.pl \
| sed -f xmlcleanup.sed \
| sed -f xmlcleanup2.sed \
> "$@"
@@ -28,7 +29,7 @@ oed.dict: oed.jargon
cat $< \
| dictfmt -j --utf8 \
--columns 0 --headword-separator ',' \
- --index-data-separator ';' \
+ --index-data-separator " " \
-u http://njw.me.uk/oed \
-s "Oxford English Dictionary, 2nd Edition" \
oed
diff --git a/htmtojargon.awk b/htmtojargon.awk
index 16c5356..323b7ee 100644
--- a/htmtojargon.awk
+++ b/htmtojargon.awk
@@ -1,5 +1,6 @@
#!/usr/bin/awk -f
# dirty xml reading is more fun
+# requires nawk / gawk for sub()
BEGIN {
FS = "</hg>"
@@ -8,6 +9,10 @@ BEGIN {
{ printdefs($1, $2); }
function printdefs(word, defs) {
+ # split the headwords and other information about the word,
+ # to be used in conjunction with dictfmt's --index-data-separator
+ sub("<i>", " ", word);
+
n = split(defs, array, "<def>");
for(i=0; i<=n; i++) {
if(array[i] != "") {
diff --git a/removeetags.pl b/removeetags.pl
new file mode 100644
index 0000000..8de7875
--- /dev/null
+++ b/removeetags.pl
@@ -0,0 +1,4 @@
+# <e> tags seem to duplicate other preceding tags, so remove them
+#
+# sed doesn't support non-greedy matching, so we're using perl
+s/<e>.*?<\/e>//g
diff --git a/separateheadmeta.sed b/separateheadmeta.sed
deleted file mode 100644
index 9dca6c6..0000000
--- a/separateheadmeta.sed
+++ /dev/null
@@ -1,2 +0,0 @@
-# this replaces the first <i> in a headword with "; "
-/^:[^:]*:/ s/<i>/; /
diff --git a/xmlcleanup2.sed b/xmlcleanup2.sed
index 5352a92..56e66b0 100644
--- a/xmlcleanup2.sed
+++ b/xmlcleanup2.sed
@@ -1,12 +1,6 @@
#!/bin/sed -f
# sed rules that must run after the main set
-# <e> tags seem to duplicate <v> tags for some quotes
-# NOTE this assumes e tags only come at end of lines
-s/<e>.*$//g
-# TODO: make this work instead of the above
-#s/<e>.*?<\/e>//g
-
# any xml tags not processed can just go away
s/<[^>]*>//g
# any xml character entities not processed can just go away