如果您有更好的解决方案,不妨共享出来!
题目如下:
Below is the requirement for the project. You need a configuration file that contains a URL and the starting number of the puzzle. For example, if the number is 1, the program extracts all puzzles; if the number is 50, it extracts the puzzles from puzzle 50 to the end.
This is a very straight-forward screen scraping job which can be done in your language of choice: perl, python, php, java, net, ruby, whatever. Deliverables will be excel output and code.
We need to screenscrape data for the Arukone puzzles located here at the following URL:
http://www.menneske.no/arukone/5x5/eng/?number=1
Output should be one Excel file that includes all puzzles.
Every puzzle should be output as: puzzle number, difficulty level, and the location of each numbered point in the order (number, row coordinate, column coordinate).
For example, the sample 5x5 puzzle located at http://www.menneske.no/arukone/5x5/eng/?number=499
would have output:
499,super easy,3,0,4
499,super easy,2,1,1
499,super easy,3,1,3
499,super easy,1,1,4
499,super easy,2,3,4
499,super easy,1,4,4
There should be 1,434(constant) puzzles for the 5x5 excel file.
If you have questions, please do not hesitate to call or write; our senior engineer will answer all your questions or guide you through the test.
Please reply with your estimated finish date.
大致的意思为:
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.net.URL;
import java.net.URLConnection;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.channels.FileChannel.MapMode;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Scrapes Arukone puzzle pages and appends one CSV-style line per numbered cell to
 * {@code result.xls}.
 *
 * <p>Settings are read from {@code config.txt} in the working directory, one
 * {@code key=value} entry per line: {@code path} (URL prefix to which the puzzle number is
 * appended), {@code startIndex} and {@code maxIndex} (inclusive range of puzzle numbers).
 *
 * <p>Each output line has the form
 * {@code puzzleNumber,difficulty,cellValue,rowIndex,columnIndex}.
 */
public class PuzzleParsor {

    /** Immutable holder for the three settings read from {@code config.txt}. */
    private static class Config {
        private final String solutionPath;
        private final int startIndex;
        private final int maxIndex;

        public Config(String solution, int startIndex, int maxIndex) {
            this.solutionPath = solution;
            this.startIndex = startIndex;
            this.maxIndex = maxIndex;
        }

        public String getSolutionPath() {
            return solutionPath;
        }

        public int getStartIndex() {
            return startIndex;
        }

        public int getMaxIndex() {
            return maxIndex;
        }
    }

    private static final String LINE_SEPARATOR = System.getProperty("line.separator");
    private static final Charset UTF_8 = Charset.forName("utf-8");

    private static final String MAIN_MARKER = "<!-- main -->";
    private static final String FOOTER_MARKER = "<!-- footer -->";

    // Compiled once; these are applied to every downloaded page.
    private static final Pattern DIFFICULTY_PATTERN = Pattern.compile("Difficulty: (.*?)<br");
    private static final Pattern ROW_PATTERN =
            Pattern.compile("<tr class=\"arukone\">(.*?)</tr>");
    // Captures only a leading run of digits so blank or placeholder cells count as empty.
    private static final Pattern CELL_PATTERN =
            Pattern.compile("<td class=\"[^>]*\">\\s*(\\d+)");

    /**
     * Downloads every puzzle in the configured range and appends its cells to the result file.
     * Results are written puzzle by puzzle, so the output of already processed puzzles is kept
     * if a later download fails.
     */
    public static void main(String[] args) throws Exception {
        Config config = getConfig();
        String solutionPath = config.getSolutionPath();
        int maxIndex = config.getMaxIndex();
        for (int i = config.getStartIndex(); i <= maxIndex; i++) {
            output(extractPuzzleSolutionFromUrl(i, solutionPath + i));
        }
    }

    /**
     * Appends the given lines (which already carry their line terminators) to
     * {@code result.xls}, encoded as UTF-8. Existing file content is preserved.
     */
    private static void output(List<String> list) throws Exception {
        StringBuilder text = new StringBuilder();
        for (String str : list) {
            text.append(str);
        }
        ByteBuffer bytes = UTF_8.encode(text.toString());

        RandomAccessFile resultFile = new RandomAccessFile("result.xls", "rw");
        try {
            long position = resultFile.length();
            // A single write call is not guaranteed to drain the buffer.
            while (bytes.hasRemaining()) {
                position += resultFile.getChannel().write(bytes, position);
            }
        } finally {
            resultFile.close();
        }
    }

    /**
     * Fetches one puzzle page and converts it to output lines.
     *
     * @param index puzzle number, used as the first column of every line
     * @param urlName full URL of the puzzle page
     * @return one line per non-empty cell, in row-major order
     */
    private static List<String> extractPuzzleSolutionFromUrl(int index, String urlName)
            throws Exception {
        System.out.println("process:" + index);
        return parsePuzzle(index, extractPuzzleTableFromUrl(urlName));
    }

    /** Builds the output lines for one puzzle from the already downloaded page fragment. */
    private static List<String> parsePuzzle(int index, String content) {
        String difficulty = "";
        Matcher difficultyMatcher = DIFFICULTY_PATTERN.matcher(content);
        if (difficultyMatcher.find()) {
            difficulty = difficultyMatcher.group(1);
        }

        List<String> rowList = new ArrayList<String>();
        Matcher rowMatcher = ROW_PATTERN.matcher(content);
        while (rowMatcher.find()) {
            rowList.add(rowMatcher.group(1));
        }

        int[][] table = parseTableInfoToCharArray(rowList);
        List<String> resultList = new ArrayList<String>();
        for (int i = 0; i < table.length; i++) {
            for (int j = 0; j < table[i].length; j++) {
                // 0 marks a cell without a number.
                if (table[i][j] != 0) {
                    resultList.add(index + "," + difficulty + "," + table[i][j] + "," + i + ","
                            + j + LINE_SEPARATOR);
                }
            }
        }
        return resultList;
    }

    /** Converts each row's inner HTML into an array of cell values (0 for empty cells). */
    private static int[][] parseTableInfoToCharArray(List<String> rowList) {
        int[][] array = new int[rowList.size()][];
        for (int i = 0; i < rowList.size(); i++) {
            array[i] = parseRowInfoToArray(rowList.get(i));
        }
        return array;
    }

    /**
     * Parses the inner HTML of one table row.
     *
     * @param info the markup between {@code <tr class="arukone">} and {@code </tr>}
     * @return one entry per {@code </td>}-delimited cell; cells whose text does not start with
     *         digits (empty, whitespace, entities such as {@code &nbsp;}) are reported as 0
     */
    private static int[] parseRowInfoToArray(String info) {
        String[] cells = info.split("</td>");
        int[] result = new int[cells.length];
        for (int i = 0; i < cells.length; i++) {
            Matcher matcher = CELL_PATTERN.matcher(cells[i]);
            if (matcher.find()) {
                result[i] = Integer.parseInt(matcher.group(1));
            }
        }
        return result;
    }

    /**
     * Downloads a page and returns the part between the {@code <!-- main -->} and
     * {@code <!-- footer -->} markers.
     *
     * @throws IllegalStateException if the markers are missing or out of order
     */
    private static String extractPuzzleTableFromUrl(String urlName) throws Exception {
        URL url = new URL(urlName);
        URLConnection urlConnection = url.openConnection();
        InputStream inputStream = urlConnection.getInputStream();
        StringBuilder page = new StringBuilder();
        try {
            // Fixed charset instead of the platform default, so runs are reproducible.
            BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, UTF_8));
            String line;
            while ((line = reader.readLine()) != null) {
                // Lines are joined without separators on purpose: the row and difficulty
                // patterns use '.' which does not match line terminators.
                page.append(line);
            }
        } finally {
            inputStream.close();
        }

        String content = page.toString();
        int startIndex = content.indexOf(MAIN_MARKER);
        int endIndex = content.indexOf(FOOTER_MARKER);
        if (startIndex < 0 || endIndex < startIndex) {
            throw new IllegalStateException(
                    "Page markers not found in response from " + urlName);
        }
        return content.substring(startIndex, endIndex);
    }

    /**
     * Reads {@code config.txt} (UTF-8) from the working directory.
     *
     * @throws IllegalArgumentException if {@code path} is missing or an index is not a number
     */
    private static Config getConfig() throws Exception {
        String content;
        RandomAccessFile file = new RandomAccessFile("config.txt", "r");
        try {
            ByteBuffer buffer = file.getChannel().map(MapMode.READ_ONLY, 0, file.length());
            CharBuffer charBuffer = UTF_8.decode(buffer);
            content = charBuffer.toString();
        } finally {
            file.close();
        }

        String solution = extractInfo(content, LINE_SEPARATOR, "path");
        if (solution.length() == 0) {
            throw new IllegalArgumentException("config.txt: missing entry 'path'");
        }
        int maxIndex = parseIndex(content, "maxIndex");
        int startIndex = parseIndex(content, "startIndex");
        return new Config(solution, startIndex, maxIndex);
    }

    /** Reads an integer entry, naming the offending key when it is absent or malformed. */
    private static int parseIndex(String content, String name) {
        String value = extractInfo(content, LINE_SEPARATOR, name);
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException(
                    "config.txt: entry '" + name + "' is missing or not a number: '" + value
                            + "'");
        }
    }

    /**
     * Returns the value of the first {@code name=value} line in {@code content}.
     *
     * @param content full text of the configuration file
     * @param lineSeparator unused; kept so existing call sites compile. Lines are recognised
     *        by the regex engine's own line-terminator rules, so files with {@code \n},
     *        {@code \r\n} or {@code \r} endings all work regardless of platform
     * @param name key to look up; must start its line (optionally after blanks or a BOM)
     * @return the trimmed value, or an empty string if the key is absent
     */
    private static String extractInfo(String content, String lineSeparator, String name) {
        Pattern entryPattern = Pattern.compile(
                "^\\uFEFF?[ \\t]*" + Pattern.quote(name) + "[ \\t]*=(.*)$", Pattern.MULTILINE);
        Matcher entryMatcher = entryPattern.matcher(content);
        if (entryMatcher.find()) {
            return entryMatcher.group(1).trim();
        }
        return "";
    }
}
path=http://www.menneske.no/arukone/5x5/eng/showpuzzle.html?number= startIndex=1400 maxIndex=1413