POJ
排序的思想就是根据选取范围的题目的totalSubmittedNumber和totalAcceptedNumber计算一个avgAcceptRate。
每一道题都有一个value,value = acceptedNumber / avgAcceptRate + submittedNumber。
这里用到avgAcceptRate的原因是考虑到通过的数量占的权重可能比提交的数量占的权重更大,所以给acceptedNumber乘上了一个因子(即1 / avgAcceptRate,也就是公式里的"除以avgAcceptRate")。
当然计算value还有别的方法,比如直接使用POJ上volume list页面提供的通过率:http://poj.org/problemlist
今天(2016.01.22)的时候POJ上面题号范围是[1000, 4054],但是连续取数据的话可能服务器会返回504错误,所以我就选择了每次少取一些题目爬它对应的网页。
我的程序是逐个抓取每道题目所在的页面来获取数据的,比如对于题号1000,它对应的网页是http://poj.org/problem?id=1000。这样的话,如果我连续请求的次数过多,服务器就可能会返回给我一个504错误。我觉得可行的解决办法有:
(1)每次少请求一些页面。比如下面的样例里面请求的页面的题号范围是[1000, 1099]。
(2)出现504错误时进行Thread.sleep(...),等待一段时间。
(3)变更User-agent。
(4)从题号页面(如http://poj.org/problemlist?volume=2)开始抓,而不是从题目页面开始抓。(这样的话只要抓31个页面就抓完了)
import java.io.BufferedReader; import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.URL; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; public class PojProblemSort { private static double avgAcceptRate; private static List<ProblemObject> problemObjectList = new ArrayList<ProblemObject>(); // 这段代码用于获取网页源代码 private static String getPageContent(String urlString) throws Exception { URL url = new URL(urlString); HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection(); BufferedReader reader = new BufferedReader(new InputStreamReader(urlConnection.getInputStream(), "UTF-8")); String line; String ans = ""; while ((line = reader.readLine()) != null){ ans += line + "\n"; } return ans; } // 这段代码关于获取题号为problemId的题目对应的网址 private static String getProblemUrl(int problemId) { return "http://poj.org/problem?id=" + problemId; } // 这个类用于记录每一道题目的基本信息 static class ProblemObject { int problemId; String title; int submittedNumber; int acceptedNumber; double value; public ProblemObject(int problemId, String title, int submittedNumber, int acceptedNumber) { this.problemId = problemId; this.title = title; this.submittedNumber = submittedNumber; this.acceptedNumber = acceptedNumber; } public void calculateValue() { value = (double) acceptedNumber / avgAcceptRate + submittedNumber; } @Override public String toString() { return problemId + ":\t" + title + "\t" + acceptedNumber + "/" + submittedNumber + " (" + ( (double) acceptedNumber / submittedNumber ) + ")\t" + value; } } private static ProblemObject parseProblemInformation(int problemId) throws Exception { String urlString = getProblemUrl(problemId); String content = getPageContent(urlString); String title = null; int acceptedNumber = 0; int submittedNumber = 0; Pattern pattern = Pattern.compile("<title>([^<]*)"); Matcher matcher = pattern.matcher(content); if 
(matcher.find()) { title = matcher.group(1).trim(); } else { return null; } pattern = Pattern.compile("<b>Total Submissions:</b>([^<]*)"); matcher = pattern.matcher(content); if (matcher.find()) { submittedNumber = Integer.parseInt(matcher.group(1).trim()); } else { return null; } pattern = Pattern.compile("<b>Accepted:</b>([^<]*)"); matcher = pattern.matcher(content); if (matcher.find()) { acceptedNumber = Integer.parseInt(matcher.group(1).trim()); } else { return null; } // debug // System.out.println(problemId + "\t" + title + "\t" + submittedNumber + "\t" + acceptedNumber); ProblemObject problemObject = new ProblemObject(problemId, title, submittedNumber, acceptedNumber); return problemObject; } public static void main(String[] args) { int startProblemId = 1000; int endProblemId = 1099; double totalAcceptedNumber = 0; double totalSubmittedNumber = 0; for (int problemId = startProblemId; problemId <= endProblemId; problemId ++) { System.out.println("problem id: " + problemId); ProblemObject problemObject = null; try { problemObject = parseProblemInformation(problemId); } catch (Exception e) { e.printStackTrace(); } if (problemObject != null) { totalAcceptedNumber += problemObject.acceptedNumber; totalSubmittedNumber += problemObject.submittedNumber; problemObjectList.add(problemObject); } } avgAcceptRate = (double) totalAcceptedNumber / (double) totalSubmittedNumber; for (ProblemObject problemObject : problemObjectList) { problemObject.calculateValue(); } Collections.sort(problemObjectList, new Comparator<ProblemObject>() { @Override public int compare(ProblemObject o1, ProblemObject o2) { return o1.value > o2.value ? -1 : 1; } }); for (ProblemObject problemObject : problemObjectList) { System.out.println(problemObject); } } }
前100道题的排序结果(按value的计算结果从易到难):
1000: 1000 -- A+B Problem 210240/376297 (0.558707616590087) 968725.9801441709
1002: 1002 -- 487-3279 47583/267412 (0.17793891074446921) 401494.70625095174
1004: 1004 -- Financial Management 63395/169851 (0.37323889762203344) 348489.86603995296
1003: 1003 -- Hangover 55430/113487 (0.4884259871174672) 269681.53181788145
1001: 1001 -- Exponentiation 37121/152436 (0.24351859140885354) 257038.1507416846
1006: 1006 -- Biorhythms 39323/124555 (0.3157079201958974) 235362.10039102565
1011: 1011 -- Sticks 31019/132077 (0.23485542524436503) 219484.50825291115
1005: 1005 -- I Think I Need a Houseboat 41460/95226 (0.4353852939323294) 212054.88849304285
1007: 1007 -- DNA Sorting 37034/92137 (0.40194492983274904) 196493.9960552665
1088: 1088 -- 滑雪 32502/86863 (0.3741754256703084) 178449.40940185427
1061: 1061 -- 青蛙的约会 19476/101103 (0.19263523337586422) 155983.8353181501
1008: 1008 -- Maya Calendar 22303/72459 (0.30780165334879034) 135305.95369175915
1014: 1014 -- Dividing 17011/65484 (0.2597733797568872) 113418.78586963704
1050: 1050 -- To the Max 23751/44853 (0.5295297973379707) 111780.22939214329
1012: 1012 -- Joseph 19386/50923 (0.3806924179643776) 105550.22702185549
1017: 1017 -- Packets 16573/48917 (0.33879837275384833) 95617.55882767003
1013: 1013 -- Counterfeit Dollar 13654/43045 (0.3172029271692415) 81520.1964178487
1062: 1062 -- 昂贵的聘礼 12454/42515 (0.29293190638598143) 77608.75246725412
1046: 1046 -- Color Me Less 15811/32557 (0.4856405688484811) 77110.34191904246
1067: 1067 -- 取石子游戏 12876/38484 (0.3345806049267228) 74766.89358987988
1028: 1028 -- Web Navigation 14297/32029 (0.4463767210965063) 72316.08680137564
1019: 1019 -- Number Sequence 10602/36691 (0.2889536943664659) 66566.05730350314
1077: 1077 -- Eight 12333/28242 (0.4366900361164224) 62994.79020223583
1068: 1068 -- Parencodings 13885/23686 (0.5862112640378283) 62812.12437833816
1094: 1094 -- Sorting It All Out 10853/31220 (0.34762972453555413) 61802.34266316918
1042: 1042 -- Gone Fishing 9626/31817 (0.3025426658704466) 58941.81622368621
1083: 1083 -- Moving Tables 9683/29050 (0.33332185886402754) 56335.43481133946
1064: 1064 -- Cable master 6959/32595 (0.21349900291455745) 52204.55704348975
1018: 1018 -- Communication System 9182/25736 (0.3567764998445757) 51609.68196196622
1035: 1035 -- Spell checker 8515/23384 (0.36413787204926445) 47378.1626994274
1080: 1080 -- Human Gene Functions 10192/18345 (0.5555737258108476) 47064.73062038333
1015: 1015 -- Jury Compromise 6998/26666 (0.26243156078901975) 46385.45397188408
1065: 1065 -- Wooden Sticks 8579/20337 (0.4218419629247185) 44511.506376792444
1059: 1059 -- Chutes and Ladders 870/39303 (0.022135714830928938) 41754.54686418107
1032: 1032 -- Parliament 7536/17839 (0.42244520432759686) 39074.46800973399
1016: 1016 -- Numbers That Count 6556/19550 (0.3353452685421995) 38023.95545008174
1045: 1045 -- Bode Plot 8612/13702 (0.6285213837396001) 37969.49608543379
1026: 1026 -- Cipher 5680/20750 (0.2737349397590361) 36755.50136614769
1009: 1009 -- Edge Detection 4603/19800 (0.23247474747474747) 32770.655420489056
1029: 1029 -- False coin 5019/17937 (0.2798126777053019) 32079.889323361844
1010: 1010 -- STAMPS 4955/17340 (0.28575547866205303) 31302.545645996797
1020: 1020 -- Anniversary Cake 5282/16175 (0.3265533230293663) 31058.98912253382
1087: 1087 -- A Plug for UNIX 5183/15316 (0.33840428310263776) 29921.01999660977
1056: 1056 -- IMMEDIATE DECODABILITY 6049/12635 (0.47874950534230315) 29680.29538095553
1047: 1047 -- Round and Round We Go 5794/12407 (0.46699443862335777) 28733.73854145418
1036: 1036 -- Gangsters 3366/12034 (0.2797074954296161) 21518.950281417805
1054: 1054 -- The Troublesome Frog 3332/11187 (0.2978457137749173) 20576.142702817626
1051: 1051 -- P,MTHBGWB 4434/7747 (0.5723505873241255) 20241.435397446985
1038: 1038 -- Bugs Integrated, Inc. 3721/9716 (0.38297653355290245) 20201.294116802037
1023: 1023 -- The Fun Number System 3482/10241 (0.3400058588028513) 20052.823196641948
1039: 1039 -- Pipe 3032/9889 (0.3066032965921731) 18432.78171516898
1095: 1095 -- Trees Made to Order 4025/7055 (0.5705173635719348) 18396.926584285997
1091: 1091 -- 跳蚤 2818/9372 (0.3006828851899274) 17312.757543979613
1063: 1063 -- Flip and Shift 3343/7132 (0.46873247335950646) 16552.139272364744
1089: 1089 -- Intervals 3080/7780 (0.39588688946015427) 16459.039473192766
1041: 1041 -- John's trip 2713/8061 (0.33655873961047017) 15705.881198302588
1037: 1037 -- A decorative fence 2620/7021 (0.37316621563879787) 14403.819292131506
1066: 1066 -- Treasure Hunt 2558/6166 (0.41485566007135904) 13374.111354684119
1082: 1082 -- Calendar Game 2495/5259 (0.4744247955885149) 12289.585547277904
1027: 1027 -- The Same Game 1970/5254 (0.37495241720593836) 10805.203818892775
1060: 1060 -- Modular multiplication of polynomials 1978/4375 (0.4521142857142857) 9948.746778563403
1040: 1040 -- Transportation 1753/4313 (0.4064456294922328) 9252.72603782692
1099: 1099 -- Square Ice 1600/4104 (0.3898635477582846) 8612.59193412611
1079: 1079 -- Ratio 1500/4046 (0.3707365299060801) 8272.80493824323
1033: 1033 -- Defragment 1414/4067 (0.3476764199655766) 8051.468121783951
1021: 1021 -- 2D-Nim 1572/3483 (0.45133505598621876) 7912.691575278904
1084: 1084 -- Square Destroyer 1492/3487 (0.42787496415256665) 7691.261978572598
1053: 1053 -- Set Me 1397/3051 (0.4578826614224844) 6987.56433248386
1031: 1031 -- Fence 1150/3423 (0.3359626059012562) 6663.550452653142
1090: 1090 -- Chain 1104/3350 (0.3295522388059702) 6460.928434547017
1085: 1085 -- Triangle War 1197/3036 (0.3942687747035573) 6408.990340718097
1034: 1034 -- The dog task 1165/2896 (0.4022790055248619) 6178.818502035574
1049: 1049 -- Microprocessor Simulation 922/3161 (0.29167984814931985) 5759.0761020401715
1044: 1044 -- Date bugs 879/2984 (0.2945710455764075) 5460.907693810532
1043: 1043 -- What's In A Name? 908/2531 (0.3587514816278151) 5089.625922616568
1057: 1057 -- FILE MAPPING 1031/2101 (0.49071870537839124) 5006.223927552513
1024: 1024 -- Tester Program 872/2511 (0.34727200318598167) 4968.182604098731
1022: 1022 -- Packing Unit 4D Cubes 747/2266 (0.3296557811120918) 4370.948859245128
1093: 1093 -- Formatting Text 651/2461 (0.2645266151970744) 4295.433343197561
1048: 1048 -- Follow My Logic 583/2042 (0.28550440744368266) 3684.818185997202
1025: 1025 -- Department 441/1854 (0.23786407766990292) 3096.6806518435096
1069: 1069 -- The Bermuda Triangle 620/1292 (0.47987616099071206) 3039.079374473868
1078: 1078 -- Gizilch 561/1455 (0.38556701030927837) 3035.8250469029676
1096: 1096 -- Space Station Shielding 533/1511 (0.35274652547981467) 3012.9246880557607
1030: 1030 -- Rating 400/1730 (0.23121387283236994) 2857.1479835315276
1058: 1058 -- The Gourmet Club 465/1524 (0.3051181102362205) 2834.309530855401
1092: 1092 -- Farmland 502/1315 (0.3817490494296578) 2729.5707193320673
1071: 1071 -- Illusive Chase 531/1181 (0.44961896697713805) 2677.288948138103
1072: 1072 -- Puzzle Out 319/1167 (0.27335047129391604) 2065.9005168663934
1097: 1097 -- Roads Scholar 318/859 (0.370197904540163) 1755.0826469075646
1055: 1055 -- BULK MAILING 278/961 (0.2892819979188345) 1744.367848554412
1074: 1074 -- Parallel Expectations 275/894 (0.3076062639821029) 1668.9142386779254
1052: 1052 -- Plato's Blocks 309/767 (0.4028683181225554) 1637.7218172781052
1086: 1086 -- Unscrambling Images 294/749 (0.3925233644859813) 1577.4537678956729
1081: 1081 -- You Who? 268/794 (0.33753148614609574) 1549.1891489661236
1076: 1076 -- Bowl 201/915 (0.21967213114754097) 1481.3918617245927
1073: 1073 -- The Willy Memorial Program 207/710 (0.2915492957746479) 1293.2990814775656
1075: 1075 -- University Entrance Examination 185/624 (0.296474358974359) 1145.3059423833315
1098: 1098 -- Robots 178/566 (0.31448763250883394) 1067.5808526715298
1070: 1070 -- Deformed Wheel 143/652 (0.21932515337423314) 1054.9554041125211