From 2742524159ad2bc861711df084f6bd77588e9e9b Mon Sep 17 00:00:00 2001
From: Nick White
Date: Sun, 31 Mar 2013 18:32:08 +0100
Subject: Initial commit

---
 BUGS             |  5 +++++
 Makefile         | 40 +++++++++++++++++++++++++++++++++++
 htmtojargon.awk  | 17 +++++++++++++++
 symbols          | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 symbolstosed.awk |  6 ++++++
 xmlcleanup.sed   | 39 ++++++++++++++++++++++++++++++++++
 xmlcleanup2.sed  | 13 ++++++++++++
 7 files changed, 184 insertions(+)
 create mode 100644 BUGS
 create mode 100644 Makefile
 create mode 100644 htmtojargon.awk
 create mode 100644 symbols
 create mode 100644 symbolstosed.awk
 create mode 100644 xmlcleanup.sed
 create mode 100644 xmlcleanup2.sed

diff --git a/BUGS b/BUGS
new file mode 100644
index 0000000..01a1916
--- /dev/null
+++ b/BUGS
@@ -0,0 +1,5 @@
+Noun, adj / verb (and probably similar) markings after the headword are searched against in the index, so searching for "A" or "n" is impossible, as it returns all adjectives / nouns.
+
+Search results are not returned in order. This may be a limitation of the dict protocol.
+
+See TODO notes in scripts
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..56e318d
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,40 @@
+# Builds oed.dict.dz; requires dictfmt and dictzip tools (from the package dictfmt)
+
+PREFIX = /usr
+SRC = 1.htm 2.htm 3.htm 4.htm 5.htm 6.htm 7.htm 8.htm 9.htm 10.htm
+JARGONS = $(SRC:.htm=.jargon)
+
+# Must stay the first rule: a bare `make` builds the first target it reads.
+all: oed.dict.dz
+
+# Every filter script is a prerequisite, so editing one rebuilds the output.
+$(JARGONS): symbols.sed htmtojargon.awk xmlcleanup.sed xmlcleanup2.sed
+.htm.jargon:
+	awk -f htmtojargon.awk < "$<" \
+	| sed -f symbols.sed \
+	| sed -f xmlcleanup.sed \
+	| sed -f xmlcleanup2.sed \
+	> "$@"
+
+symbols.sed: symbols symbolstosed.awk
+	awk -f symbolstosed.awk < symbols > $@
+
+oed.dict: $(JARGONS)
+	cat $(JARGONS) \
+	| dictfmt -j --utf8 \
+	--columns 0 --headword-separator ' ' \
+	-u http://njw.me.uk/oed \
+	-s "Oxford English Dictionary Second Edition v3" \
+	oed
+
+oed.dict.dz: oed.dict
+	dictzip -k oed.dict
+
+# A missing dictdconfig or init script is skipped instead of failing install.
+install: all
+	cp oed.dict.dz oed.index $(DESTDIR)$(PREFIX)/share/dictd/
+	if test -x /usr/sbin/dictdconfig; then /usr/sbin/dictdconfig -w; fi
+	if test -x /etc/init.d/dictd; then /etc/init.d/dictd restart; fi
+
+.SUFFIXES: .htm .jargon .sed .dict .dz
+.PHONY: all install
diff --git a/htmtojargon.awk b/htmtojargon.awk
new file mode 100644
index 0000000..16c5356
--- /dev/null
+++ b/htmtojargon.awk
@@ -0,0 +1,17 @@
+#!/usr/bin/awk -f
+# dirty xml reading is more fun
+
+BEGIN {
+	FS = ""
+}
+
+{ printdefs($1, $2); }
+# Print ":word: piece" for each non-empty piece of defs; n, array, i are locals.
+function printdefs(word, defs,    n, array, i) {
+	n = split(defs, array, "");
+	for(i=1; i<=n; i++) {
+		if(array[i] != "") {
+			printf(":%s: %s\n", word, array[i]);
+		}
+	}
+}
diff --git a/symbols b/symbols
new file mode 100644
index 0000000..797961c
--- /dev/null
+++ b/symbols
@@ -0,0 +1,64 @@
+# All symbols can be represented either as &symbol; or _symbol_ so
+# parse these with symbolstosed to create sed rules.
+
+# A large list of these in Help/Advanced/symbols.htm
+
+# punctuation
+revsc ·
+
+# Ligature digraphs
+Ae Æ
+ae æ
+Oe Œ
+oe œ
+
+# Phonetic symbols
+aepr ɶ
+ccedpr θ
+edhpr ð
+fata ɑ
+lm ː
+ope ɛ
+revc ɔ
+rfa ɒ
+schwa ə
+sh ʃ
+shti ɨ
+shtu ɯ
+sm ˈ
+smpr ˈ
+smm ˌ
+smmpr ˌ
+zh ʝ
+
+Abarab Ā
+
+# These are extras which aren't documented in the symbol list
+
+th th
+Th Th
+
+Aang Å
+ouml ö
+
+oq ‘
+cq ’
+
+times ×
+
+# Basic HTML entities
+# TODO: generate these from http://www.w3.org/TR/html4/sgml/entities.html or http://www.w3.org/TR/xml-entity-names/
+\#038 \&
+aacu á
+agrave à
+auml ä
+eacu é
+egrave è
+oacu ó
+ocirc ô
+
+# TODO
+# edh - unknown
+# ygh - some variant of y
+# thbar - probably th with bar above
+# asg
diff --git a/symbolstosed.awk b/symbolstosed.awk
new file mode 100644
index 0000000..6eb41f2
--- /dev/null
+++ b/symbolstosed.awk
@@ -0,0 +1,6 @@
+#!/usr/bin/awk -f
+# Every line that is non-empty and does not start with # yields two sed rules.
+length($0) > 0 && $0 !~ /^#/ {
+	name = $1; glyph = $2
+	printf("s/_%s_/%s/g\ns/&%s;/%s/g\n", name, glyph, name, glyph)
+}
diff --git a/xmlcleanup.sed b/xmlcleanup.sed
new file mode 100644
index 0000000..f302b11
--- /dev/null
+++ b/xmlcleanup.sed
@@ -0,0 +1,39 @@
+#!/bin/sed -f
+
+# unfortunately they use latin-1 rather than ascii...
+# NOTE: the \x hex escape is a GNU sed extension
+s/\xA3/£/g
+
+# seemingly unneeded control characters
+s/&ff[0-9];//g
+
+# corresponds to a quote. NOTE(review): this and several patterns below appear empty here (sed would then reuse the previous regex) - presumably tag names were lost from this copy; confirm against the repository
+s//\n\n/g
+
+# quote text start and end points become curly double quotes
+s//“/g
+s/<\/qt>/”/g
+
+# new paragraph: the ∥ marker becomes two newlines
+s/∥/\n\n/g
+
+# a date: the closing </d> tag becomes a colon
+s///g
+s/<\/d>/:/g
+
+# brackets: &obr; and &cbr; become { and }
+s/&obr;/{/g
+s/&cbr;/}/g
+
+# space
+s/ / /g
+
+# superscript: the closing </sup> tag is dropped
+s// /g
+s/<\/sup>//g
+
+# xNNNN codes (unknown what the NNNN refers to; not unicode)
+s//`/g
+s//'/g
+s//-/g
+s//../g
diff --git a/xmlcleanup2.sed b/xmlcleanup2.sed
new file mode 100644
index 0000000..5352a92
--- /dev/null
+++ b/xmlcleanup2.sed
@@ -0,0 +1,13 @@
+#!/bin/sed -f
+# sed rules that must run after the main set
+
+# tags seem to duplicate tags for some quotes (NOTE(review): the tag names are missing from this comment as shown here; confirm)
+# NOTE this assumes e tags only come at end of lines
+s/.*$//g
+# TODO: make this work instead of the above (sed regexes have no non-greedy .*?, so it needs another form)
+#s/.*?<\/e>//g
+
+# any xml tags not processed can just go away
+s/<[^>]*>//g
+# any xml character entities not processed can just go away
+s/&[^;]*;//g
-- 
cgit v1.2.3