Wednesday, 7 August 2013

crawler4j compile error with class CrawlConfig - VariableDeclaratorId Expected

crawler4j compile error with class CrawlConfig - VariableDeclaratorId
Expected

The code below will not compile, even after I changed the JRE to 1.7.
Eclipse does not highlight the class, and the compiler appears to fail at
the CrawlConfig usage. The class is meant to be run from the command line
on Linux. Any ideas?
Compiler error (from the Eclipse Problems view):
  Description: Syntax error on token "crawlStorageFolder",
               VariableDeclaratorId expected after this token
  Resource:    zeocrawler.java
  Path:        /zeowebcrawler/src/main/java/com/example
  Location:    line 95
  Type:        Java Problem
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.WebURL;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
public class Controller {
String crawlStorageFolder = "/data/crawl/root";
int numberOfCrawlers = 7;
CrawlConfig config = new CrawlConfig();
config.setCrawlStorageFolder(crawlStorageFolder);
PageFetcher pageFetcher = new PageFetcher(config);
RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
RobotstxtServer robotstxtServer = new
RobotstxtServer(robotstxtConfig, pageFetcher);
CrawlController controller = new CrawlController(config,
pageFetcher, robotstxtServer);
controller.addSeed("http://www.senym.com");
controller.addSeed("http://www.merrows.co.uk");
controller.addSeed("http://www.zeoic.com");
controller.start(MyCrawler.class, numberOfCrawlers);
}
public URLConnection connectURL(String strURL) {
URLConnection conn =null;
try {
URL inputURL = new URL(strURL);
conn = inputURL.openConnection();
int test = 0;
}catch(MalformedURLException e) {
System.out.println("Please input a valid URL");
}catch(IOException ioe) {
System.out.println("Can not connect to the URL");
}
return conn;
}
public static void updatelongurl()
{
// System.out.println("Short URL: "+ shortURL);
// urlConn = connectURL(shortURL);
// urlConn.getHeaderFields();
// System.out.println("Original URL: "+ urlConn.getURL());
/* connectURL - This function will take a valid url and return a
URL object representing the url address. */
}
/**
 * Crawler that skips static/binary resources, stays on the seed sites and
 * prints a short summary of every HTML page it visits.
 *
 * <p>NOTE(review): this class is declared {@code public}, so it presumably
 * needs its own {@code MyCrawler.java} file; confirm against the project
 * layout.
 */
public class MyCrawler extends WebCrawler {

    /**
     * Matches URLs that end in a stylesheet, script, image, media, document
     * or archive extension. Compiled once and shared by all crawler
     * instances instead of being recompiled per instance.
     */
    private static final Pattern FILTERS = Pattern.compile(
            ".*(\\.(css|js|bmp|gif|jpe?g"
                    + "|png|tiff?|mid|mp2|mp3|mp4"
                    + "|wav|avi|mov|mpeg|ram|m4v|pdf"
                    + "|rm|smil|wmv|swf|wma|zip|rar|gz))$");

    /**
     * URL prefixes the crawl is allowed to follow. These mirror the seeds
     * registered in {@code Controller}; the sample value
     * {@code http://www.ics.uci.edu/} matched none of them, so no outgoing
     * link would ever have been followed.
     */
    private static final String[] ALLOWED_PREFIXES = {
        "http://www.senym.com/",
        "http://www.merrows.co.uk/",
        "http://www.zeoic.com/",
    };

    /**
     * Decides whether the given URL should be crawled.
     *
     * @param url the candidate URL
     * @return {@code true} if the URL does not end in a filtered extension
     *     and starts with one of the allowed site prefixes
     */
    @Override
    public boolean shouldVisit(WebURL url) {
        // Locale.ROOT keeps the lower-casing independent of the JVM's default
        // locale (e.g. Turkish dotless i would break the "gif" match).
        String href = url.getURL().toLowerCase(Locale.ROOT);
        if (FILTERS.matcher(href).matches()) {
            return false;
        }
        return hasAllowedPrefix(href);
    }

    /** Returns whether {@code href} starts with any of the allowed site prefixes. */
    private static boolean hasAllowedPrefix(String href) {
        for (String prefix : ALLOWED_PREFIXES) {
            if (href.startsWith(prefix)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Called when a page has been fetched and is ready to be processed.
     * Prints the page URL and, for HTML pages, the text length, HTML length
     * and number of outgoing links.
     *
     * @param page the fetched page
     */
    @Override
    public void visit(Page page) {
        String url = page.getWebURL().getURL();
        System.out.println("URL: " + url);

        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            String text = htmlParseData.getText();
            String html = htmlParseData.getHtml();
            List<WebURL> links = htmlParseData.getOutgoingUrls();

            System.out.println("Text length: " + text.length());
            System.out.println("Html length: " + html.length());
            System.out.println("Number of outgoing links: " + links.size());
        }
    }
}

No comments:

Post a Comment