import java.io.*;
import java.util.regex.*;
import org.apache.log4j.Logger;
/**
 * Utility for turning plain-text URLs (for example "www.google.com") into
 * HTML hyperlinks, together with small helpers for reading and writing the
 * text files being converted.
 */
public class RegexpMakeHyperlinks {
    Logger log = Logger.getLogger(this.getClass());

    /**
     * Candidate URL matcher. Capture groups:
     * 1 = leading "@" (the domain part of an e-mail address),
     * 2 = an href attribute prefix such as href=" or HREF=' (already a link),
     * 3 = an explicit http:// or https:// scheme.
     * NOTE(review): the path character class contains a newline, so a match
     * may continue across line breaks once a "/" path has started - confirm
     * this is intended.
     */
    private static final Pattern URL_PATTERN = Pattern.compile(
            "(@)?((?i:href)=['\"])?(https?://)?"
            + "[a-zA-Z_0-9\\-]+(\\.\\w[a-zA-Z_0-9\\-]+)+(/[#;@&\\n\\-=?\\+\\%/\\.\\w]+)?");

    /** Pattern used by {@link #matchLink(String)}; the dot before "com" is a literal dot. */
    private static final Pattern WWW_DOT_COM_PATTERN =
            Pattern.compile("(http://)?www\\..*?\\.com\\b");

    /** A candidate is only converted when it contains one of these substrings. */
    //TO DO: add more top domains
    private static final String[] KNOWN_TOP_DOMAINS = {
        ".br", ".com", ".COM", ".net", ".org", ".se", ".pt", ".es"
    };

    /**
     * Loads a file into a StringBuffer.
     * Every line read is followed by the platform line separator, so the
     * original line endings are normalised and a non-empty result always ends
     * with a separator. The file is decoded by {@link FileReader}, i.e. with
     * the platform default charset.
     *
     * @param f File to read
     * @return StringBuffer with contents of text file.
     * @throws IOException if the file cannot be opened or read
     */
    public StringBuffer loadFile(File f) throws IOException {
        log.debug("Reading file " + f.getAbsolutePath());
        String lineSeparator = System.getProperty("line.separator");
        StringBuffer contents = new StringBuffer();
        BufferedReader in = new BufferedReader(new FileReader(f));
        try {
            String line;
            while ((line = in.readLine()) != null) {
                contents.append(line).append(lineSeparator);
            }
        } finally {
            // Release the file handle even when reading fails part-way.
            in.close();
        }
        return contents;
    }

    /**
     * Writes the contents of a StringBuffer to file, one line at a time, each
     * followed by the platform line separator. An existing file is overwritten.
     *
     * @param f File to write to
     * @param sb Text to write to file
     * @throws IOException if the file cannot be opened, written or closed
     */
    public void writeToFile(File f, StringBuffer sb) throws IOException {
        log.debug("Outputting to " + f.getAbsolutePath());
        BufferedReader in = new BufferedReader(new StringReader(sb.toString()));
        // BufferedWriter (unlike PrintWriter) propagates write failures as IOException.
        BufferedWriter out = new BufferedWriter(new FileWriter(f));
        try {
            String line;
            while ((line = in.readLine()) != null) {
                out.write(line);
                out.newLine();
            }
            out.flush();
        } finally {
            // Always release the file handle; then the in-memory reader.
            try {
                out.close();
            } finally {
                in.close();
            }
        }
    }

    /**
     * Converts all urls like "www.google.com" into hyperlinks in the text.
     * A candidate is left untouched when it is the domain part of an e-mail
     * address (preceded by "@"), when it already sits inside an href attribute,
     * or when it contains none of the known top domains. Candidates without an
     * explicit scheme get "http://" prepended in the link target; the visible
     * link text is the matched text itself.
     *
     * @param initialText The text to convert
     * @return Converted text.
     */
    public StringBuffer convertText(StringBuffer initialText) {
        StringBuffer result = new StringBuffer(initialText.length());
        Matcher m = URL_PATTERN.matcher(initialText);
        while (m.find()) {
            // Skipped matches are copied verbatim by the next appendReplacement/appendTail.
            if (m.group(1) != null) {
                continue; // "@example.com": part of an e-mail address
            }
            if (m.group(2) != null) {
                continue; // ignore links that are already hyperlinks
            }
            String href = m.group();
            if (!containsKnownTopDomain(href)) {
                continue;
            }
            // add on the http:// if necessary
            String target = (m.group(3) != null) ? href : "http://" + href;
            String anchor = "<a href=\"" + target + "\">" + href + "</a>";
            // quoteReplacement keeps '$' and '\' from being read as group references.
            m.appendReplacement(result, Matcher.quoteReplacement(anchor));
        }
        m.appendTail(result);
        return result;
    }

    /** Returns true when the candidate contains any of the known top-domain substrings. */
    private static boolean containsKnownTopDomain(String candidate) {
        for (String topDomain : KNOWN_TOP_DOMAINS) {
            if (candidate.indexOf(topDomain) != -1) {
                return true;
            }
        }
        return false;
    }

    /**
     * Testing method for experimenting with the patternmatching.
     * Logs, at debug level, every "www.&lt;anything&gt;.com" occurrence
     * (optionally prefixed with "http://") found in the text.
     *
     * @param text the text to search
     */
    public void matchLink(String text) {
        Matcher m = WWW_DOT_COM_PATTERN.matcher(text);
        while (m.find()) {
            log.debug("Found: " + m.group());
        }
    }
}
20080205
Make hyperlinks
Subscribe to:
Post Comments (Atom)
1 comment:
According to RFC 3986 (appendix b, p 50) a better regexp could be ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
Post a Comment