... urgently, under the cover of darkness, download a full dump of all quotes on moderation [ http://vpustotu.ru/moderation/ ] for further secret research ...
Thus, you need a program that:
- Must repeatedly refresh and parse the page, saving each quote.
- Must be able to discard duplicates.
- Must stop not only on command, but also upon reaching a certain number of duplicates ("repetitions"), for example 500!
- Since this will most likely take some time, it must be able to continue “from the place where it left off” after being closed.
- Well, since it will take a long time anyway, let it do its dirty work in several threads. Say, in as many as 4 threads (or even 5!).
- And it should report its progress to the console every, say, 10 seconds.
- And let it take all these parameters from the command-line arguments!
private static class CommandLine { @Parameter(names = "-h", help = true) boolean help; @Parameter(names = "-w", description = " ") int workers = 2; @Parameter(names = "-r", description = " ()") int reportPeriod = 10; @Parameter(names = "-d", description = "- ") int dupToStop = 500; @Parameter(names = "-hf", description = " ") String hashFile = "hash.bin"; @Parameter(names = "-qf", description = " ") String quotesFile = "quotes.txt"; } ... CommandLine commandLine = new CommandLine(); // JCommander commander = new JCommander(commandLine, args); // if (commandLine.help) commander.usage(); // , ...
var ( WORKERS int = 2 //- "" REPORT_PERIOD int = 10 // () DUP_TO_STOP int = 500 // HASH_FILE string = "hash.bin" // QUOTES_FILE string = "quotes.txt" // used map[string]bool = make(map[string]bool) //map , - . ) func init() { // : flag.IntVar(&WORKERS, "w", WORKERS, " ") flag.IntVar(&REPORT_PERIOD, "r", REPORT_PERIOD, " ()") flag.IntVar(&DUP_TO_STOP, "d", DUP_TO_STOP, "- ") flag.StringVar(&HASH_FILE, "hf", HASH_FILE, " ") flag.StringVar("ES_FILE, "qf", QUOTES_FILE, " ") // flag.Parse() }
// Bounded queue of strings with a fixed capacity of 10 entries.
BlockingQueue<String> queue = new ArrayBlockingQueue<>(10);
// Simplest launch: run a new Grabber on its own thread.
new Thread(new Grabber()).start();
// Run a Grabber on a thread with priority 2 (below Thread.NORM_PRIORITY), marked as a daemon
// so it does not keep the JVM alive; both settings are applied before start().
Thread worker = new Thread(new Grabber()); worker.setPriority(2); worker.setDaemon(true); worker.start();
public class Grabber implements Runnable{ ... public void run() { try { while (true) { // Document doc = Jsoup.connect("http://vpustotu.ru/moderation/").get(); Element element = doc.getElementsByClass("fi_text").first(); if (element != null){ queue.put(element.text()); // } } } catch (IOException | InterruptedException e) { e.printStackTrace(); } }
func() { for { // x, err := goquery.ParseUrl("http://vpustotu.ru/moderation/") if err == nil { if s := strings.TrimSpace(x.Find(".fi_text").Text()); s != "" { c <- s // } } time.Sleep(100 * time.Millisecond) } }
// InputStream hashStream = Files.newInputStream(Paths.get(commandLine.hashFile) // OutputStream hashFile = Files.newOutputStream(Paths.get(commandLine.hashFile), CREATE, APPEND, WRITE);
// hash_file, err := os.OpenFile(HASH_FILE, os.O_RDONLY, 0666) // hash_file, err := os.OpenFile(HASH_FILE, os.O_APPEND|os.O_CREATE, 0666)
public static void main(String[] args) throws IOException
// Opens all three files in one try-with-resources statement, so each is closed automatically
// when the block exits: the hash file as an appending output stream (created if missing),
// the same hash file as an input stream, and the quotes file as an appending UTF-8 writer.
try ( OutputStream hashFile = Files.newOutputStream(Paths.get(commandLine.hashFile), CREATE, APPEND, WRITE); InputStream hashStream = Files.newInputStream(Paths.get(commandLine.hashFile)); BufferedWriter quotesFile = Files.newBufferedWriter(Paths.get(commandLine.quotesFile), Charset.forName("UTF8"), CREATE, APPEND, WRITE);) { ... }
// Render the hash bytes as a hex string.
// NOTE(review): Hex is presumably Apache Commons Codec's org.apache.commons.codec.binary.Hex — confirm.
Hex.encodeHexString(hash);
/**
 * Converts a byte array to lowercase hexadecimal text, two digits per byte.
 *
 * @param a the bytes to encode
 * @return a string of {@code 2 * a.length} hex digits; empty for an empty array
 */
static String encodeHexString(byte[] a) {
    StringBuilder hex = new StringBuilder();
    for (int i = 0; i < a.length; i++) {
        int unsigned = a[i] & 0xff; // drop the sign extension so 0x80..0xff stay two digits
        hex.append(Character.forDigit(unsigned >>> 4, 16));
        hex.append(Character.forDigit(unsigned & 0x0f, 16));
    }
    return hex.toString();
}
// Build a String by decoding the raw hash bytes as UTF-16.
// NOTE(review): arbitrary bytes are not always valid UTF-16, so different inputs could decode
// to the same text — confirm before relying on the result as a unique key.
new String(hash, "UTF16");
/**
 * Sorted set of byte arrays compared by content rather than by identity.
 * Arrays are ordered by length first; arrays of equal length are ordered by their first
 * differing element, compared as signed bytes.
 */
static Set<byte[]> hashes = new TreeSet<>(new Comparator<byte[]>() {
    @Override
    public int compare(byte[] left, byte[] right) {
        if (left.length != right.length) {
            return left.length - right.length;
        }
        for (int pos = 0; pos < left.length; pos++) {
            int diff = left[pos] - right[pos];
            if (diff != 0) {
                return diff;
            }
        }
        return 0; // same length and same content
    }
});
// Register a hook that prints the number of collected hashes when the JVM shuts down.
Runtime.getRuntime().addShutdownHook(new Thread() {
    @Override
    public void run() {
        // The count is passed as an argument instead of being concatenated into the format
        // string, and %n terminates the line so the message is not glued to later output.
        System.out.printf("Shutting down. Total records: %d%n", hashes.size());
    }
});
// Periodic progress report: the timer fires every reportPeriod seconds (the delay argument is
// in milliseconds) and prints the number of stored hashes, the duplicate count and the average
// number of quotes per second over the period that just ended.
new Timer(commandLine.reportPeriod * 1000, new ActionListener() {
    @Override
    public void actionPerformed(ActionEvent event) {
        System.out.printf("Total %d / Duplicates %d (%d records/sec) \n",
                hashes.size(), dupCount, quotesCount / commandLine.reportPeriod);
        quotesCount = 0; // start counting afresh for the next period
    }
}).start();
Source: https://habr.com/ru/post/197926/