Last active
January 27, 2017 13:59
-
-
Save vseloved/c8c6cef0ef3c6065c03de06025eb76b3 to your computer and use it in GitHub Desktop.
Filter Cyrillic texts with Ukrainian letters from CC WET files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;;; Filter a WARC/WET text stream: read records from standard input and print
;;; the body of every record that looks like Ukrainian text, each followed by
;;; a "===(=^.^=)===" separator line.

(defun ukrainian-text-p (text)
  "Return true when TEXT contains at least one Ukrainian-specific letter
(і ї є ґ in either case) and the number of those letters plus the letters in
the ranges а-я / А-Я exceeds (FLOOR (LENGTH TEXT) 2)."
  (let ((ukrainian (count-if (lambda (ch) (find ch "іїєІЇЄҐґ")) text)))
    (and (plusp ukrainian)
         (> (+ ukrainian
               ;; The Ukrainian-specific letters lie outside а-я / А-Я,
               ;; so the two counts never overlap.
               (count-if (lambda (ch)
                           (or (char<= #\а ch #\я)
                               (char<= #\А ch #\Я)))
                         text))
            (floor (length text) 2)))))

(defun filter-wet (&optional (in *standard-input*) (out *standard-output*))
  "Read WARC records from IN and write the bodies that satisfy
UKRAINIAN-TEXT-P to OUT, each followed by the line \"===(=^.^=)===\".

A record starts at a line equal to \"WARC/1.0\" followed by a Return
character; its header ends at the first line consisting of a single Return
character; every following line up to the next record start is body text.
Returns no values."
  (let ((state nil)
        ;; An explicit string stream: calling GET-OUTPUT-STREAM-STRING on a
        ;; stream created by WITH-OUTPUT-TO-STRING has undefined consequences.
        (buffer (make-string-output-stream))
        (record-start (format nil "WARC/1.0~C" #\Return))
        (header-end (string #\Return)))
    (flet ((flush-record ()
             ;; GET-OUTPUT-STREAM-STRING also clears BUFFER for the next record.
             (let ((text (string-trim '(#\Newline #\Return #\Linefeed)
                                      (get-output-stream-string buffer))))
               (when (ukrainian-text-p text)
                 (write-line text out)
                 (write-line "===(=^.^=)===" out)))))
      (loop :for line := (read-line in nil)
            :while line
            :do (cond ((string= record-start line)
                       ;; A new record begins: emit the one just finished.
                       (flush-record)
                       (setf state :header))
                      ((and (eql :header state)
                            (string= header-end line))
                       (setf state :body))
                      ((eql :body state)
                       (write-line line buffer))))
      ;; The last record has no following "WARC/1.0" line to trigger a flush,
      ;; so emit it explicitly at end of input.
      (flush-record))
    (values)))

(filter-wet)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
To run: