#
# Sample links dictionary file for HTML::TextToHTML
# http://www.katspace.com/tools/text_to_html
# http://txt2html.sourceforge.net/
# based on links dictionary for Seth Golub's txt2html
# http://www.aigeek.com/txt2html/
#
# This dictionary contains some patterns for converting obvious URLs,
# ftp sites, hostnames, email addresses and the like to hrefs.
#
# Original adapted from the html.pl package by Oscar Nierstrasz in
# the Software Archive of the Software Composition Group
# http://iamwww.unibe.ch/~scg/Src/
#
# Layout: one entry per line, written as
#     <delimiter>pattern<delimiter> -flags-> replacement
# Lines starting with '#' are comments.
#
# NOTE(review): this copy looks like it went through an HTML-to-text
# conversion.  The patterns, flags and non-HTML replacements (the plain
# "->" and "-i->" rules) read as intact, but every rule whose operator
# carries the "h" flag now has a replacement made only of $N references
# and literal text -- the markup that presumably wrapped them is gone.
# Restore those replacements from a pristine copy of the dictionary
# before relying on this file.  Patterns that contain a literal '<', '>'
# or '"' may also have been entity-decoded (e.g. from &lt; / &gt;) --
# TODO confirm against the upstream dictionary.
#
# NOTE(review): where a '#' directly precedes a rule it is kept as a
# commented-out rule; some of these may originally have been an (emptied)
# comment line followed by an active rule -- verify.
#

# Mark _underlined stuff_ as underlined stuff
# take account of possible trailing punctuation
# /([ \t\n])_([a-z][a-z0-9 -]*[a-z])_([ \t\n\.;:,\!\?])/ -hi-> $1$2$3

# Mark _emphasized stuff_ as emphasized stuff
/\B_([a-z][a-z -]*[a-z])_\B/ -hi-> $1

# Use this one instead if you want it to match more aggressively.
# /_((\w|["'])+(\w|\s|[-!?,:;._\"\'<>#%&=+\/\$]+)+)_/ -hi-> $1

# We also need a special case for _x_
/\B_([a-z])_\B/ -hi-> $1

# Mark *bolded stuff* as bolded stuff
/\B\*([a-z0-9][a-z0-9 -_,]*[a-z0-9:])\*\B/ -hi-> $1
/\*([a-z0-9])\*/ -hi-> $1

# Use this one instead if you want it to match more aggressively.
# note that this doesn't use \w because we don't want to match numbers
# since #1 is a common enough usage.
# /\B\*([a-z-_]([a-z-_]|\s|\!|\?|,|;|:|\'|\.)*([a-z-_]|\.|\'|\"|:|\!|\?))\*\B/ -hi-> $1

# We also need a special case for #x#
#/\B\*([a-z])\*\B/ -hi-> $1

# Rule for DOQQs
|\*([][a-zA-Z.,]+)\*| -hi-> $1

# Rule for Ewok's link generation (for mailto)
# NOTE(review): the first two rules below are identical as written; they
# presumably differed before the file was damaged -- verify.
|["]([^"]+?)["]:mailto:(\S+)| -h-> $1
|["]([^"]+?)["]:mailto:(\S+)| -h-> $1
|<(ftp://.*)>| -h-> $1
|<(http://.*)>| -h-> $1
|<([a-zA-Z.]+\@[a-zA-Z.]+)>| -h-> $1

# Rule for Ewok's link generation (default case)
|"([^"]+?)":([-a-zA-Z0-9_./:?&]+)| -h-> $1

# Rule for passing through existing anchor markup
# |<a href="([^"]+?)">([^<]+?)</a>| -hi-> $2

# Rule for inserting actual image instead
# |<img| -hi-> $&

# Some people even like to mark the URL label explicitly
/<URL:([-\w\.\/:~_\@]+):([a-zA-Z0-9'() ]+)>/ -h-> $2

# Some people like to mark URLs explicitly
/<URL:\s*(\S+?)\s*>/ -h-> $1

# /<(http:\S+?)\s*>/ -h-> <$1>

# URLs of the form scheme:rest-of-url
|snews:[\w\.]+| -> $&
|news:[\w\.]+| -> $&
|nntp:[\w/\.:+\-]+| -> $&
|http:[\w/\.:\@+\-~\%#?=&;,]+[\w/]| -> $&
|shttp:[\w/\.:+\-~\%#?=&;,]+| -> $&
|https:[\w/\.:+\-~\%#?=&;,]+| -> $&
|file:[\w/\.:+\-]+| -> $&
|ftp:[\w/\.:+\-]+| -> $&
|wais:[\w/\.:+\-]+| -> $&
|gopher:[\w/\.:+\-]+| -> $&
|telnet:[\w/\@\.:+\-]+| -> $&

# catch some newsgroups to avoid confusion with sites:
|([^\w\-/\.:\@>])(alt\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(bionet\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(bit\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(biz\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(clari\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(comp\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(gnu\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(humanities\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(k12\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(misc\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(news\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(rec\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(soc\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(talk\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(us\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(ch\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(de\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2

# FTP locations (with directory):
# anonymous@host:path
|(anonymous\@)([a-zA-Z][\w\.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w\d+\-/\.]+)| -h-> $1$2:$4$3

# ftp@host:path
|(ftp\@)([a-zA-Z][\w\.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w\d+\-/\.]+)| -h-> $1$2:$4$3

# Email address
|[a-zA-Z0-9_\+\-\.]+\@([a-zA-Z0-9][\w\.+\-]+\.[a-zA-Z]{2,})| -> mailto:$&

# host:path
|([^\w\-/\.:\@>])([a-zA-Z][\w\.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w\d+\-/\.]+)| -h-> $1$2:$4$3

# NB: don't confuse an http server with a port number for
# an FTP location!
# internet number version: dotted-number:path
|([^\w\-/\.:\@])(\d{2,}\.\d{2,}\.\d+\.\d+):([\w\d+\-/\.]+)| -h-> $1$2:$3

# telnet host port
|telnet ([a-zA-Z][\w+\-]+(\.[\w\.+\-]+)+\.[a-zA-Z]{2,})\s+(\d{2,4})| -h-> telnet $1 $3

# ftp host
|ftp ([a-zA-Z][\w+\-]+(\.[\w\.+\-]+)+\.[a-zA-Z]{2,})| -h-> ftp $1

# host with "ftp" in the machine name
# NOTE(review): this pattern was split across lines right after the final
# "|^" of the first group; text between "^" and ")" may have been lost.
# As rejoined, that third alternative merely repeats the first -- verify.
|(^|[^\w\d\-/\.:!]|^)(([a-zA-Z][\w+\-]*)?ftp[\w+\-]*\.[\w\.+\-]+\.[a-zA-Z]{2,})([^\w\d\-/\.:!])| -h-> $1ftp $2$4

# ftp.foo.net/blah/
|ftp(\.[a-zA-Z0-9_\@:-]+)+/\S+| -> ftp://$&

# www.thehouse.org/txt2html/
|www(\.[a-zA-Z0-9_\@:-]+)+/\S+| -> http://$&

# host with "www" in the machine name
# NOTE(review): same split/possible truncation as the "ftp" host rule
# above -- verify the first group.
|(^|[^\w\d\-/\.:!]|^)(([a-zA-Z][\w+\-]*)?www[\w+\-]*\.[\w\.+\-]+\.[a-zA-Z]{2,})([^\w\d\-/\.:!\@])| -h-> $1$2$4

# host followed by a port number
# |([a-zA-Z][\w+\-]+\.[\w+\-]+\.[a-zA-Z]{2,})\s+(\d{2,4})| -h-> $1 $2

# just the site name:
# But this gets mixed up with things like .tar.gz files!
# |([^\w\-/\.:\@>])([a-zA-Z][\w+\-]+(\.[\w+\-]+)+\.[a-zA-Z]{2,})|
# -h-> $1$2/

# just internet numbers with port:
|([^\w\-/\.:\@])(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s+(\d{1,4})| -h-> $1$2 $3

# just internet numbers:
|([^\w\-/\.:\@])(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})| -h-> $1$2

# (see "relative path") as used by Tom Fine
# /\(see \"([^\"]+)\"\)/ -> $1.html

# RFCs
/RFC ?(\d+)/ -i-> http://www.cis.ohio-state.edu/rfc/rfc$1.txt

# This would turn "f^H_o^H_o^H_" into "foo". Gross, isn't it?
# Thanks to Mark O'Dell for fixing this.
#
# /(.\\010_)+/ -he-> $tmp = $&;$tmp =~ s@\010_@@g;"$tmp"
# /(_\\010.)+/ -he-> $tmp = $&;$tmp =~ s@_\010@@g;"$tmp"
# /(.\^H_)+/ -he-> $tmp = $&;$tmp =~ s@\^H_@@g;"$tmp"
# /(_\^H.)+/ -he-> $tmp = $&;$tmp =~ s@_\^H@@g;"$tmp"

# NOTE(review): the two commented-out rules below are visibly incomplete
# (empty pattern, "my $foo=;", "| $&"); kept verbatim for reference only.
# || -hie-> print "\n"; open(F,$2); my $foo=; close(F); $foo;

# Rule for inserting actual image instead
# | $&

# |^\#!/usr/bin/txt2html| -hi -> ''

# End of sample dictionary