Niko Partanen nikopartanen

## wiki-extraction-and-embeddings.md

      
              1 file
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                nikopartanen
                / wiki-extraction-and-embeddings.md
            
            
              Last active
              September 29, 2017 17:06
            
          
    # Download the Finnish Wikipedia dump of 2017-07-01 and decompress it.
    wget https://dumps.wikimedia.org/fiwiki/20170701/fiwiki-20170701-pages-articles-multistream.xml.bz2
bzip2 -d fiwiki-20170701-pages-articles-multistream.xml.bz2
# Run WikiExtractor on the decompressed XML; output goes to ./fiwiki-20170701.
python ../wikiextractor/WikiExtractor.py fiwiki-20170701-pages-articles-multistream.xml -o fiwiki-20170701
# Concatenate every extracted file, then per line: drop everything from the
# first '<' to the last '>', remove double quotes, guillemets and parentheses,
# put a space on both sides of each punctuation character (perl reads and
# writes UTF-8 because of -CSAD), delete empty lines, and lowercase. The result
# is appended (>>) to wiki_fin_clean.txt, so rerunning adds to the old output.
find ./fiwiki-20170701 -type f | xargs cat | sed 's/<.*>//g' | sed 's/["«»()]//g' | perl -CSAD -pe 's/ ?(\p{P}) ?/ $1 /g' | sed '/^$/d' | tr '[:upper:]' '[:lower:]' >> wiki_fin_clean.txt

# Train 100-dimensional skipgram vectors with fastText.
# NOTE(review): this reads wiki/wiki_rus_clean.txt, while the cleaning step
# above writes wiki_fin_clean.txt - presumably the same steps were repeated
# for the Russian dump; confirm.
fastText/fasttext skipgram -input wiki/wiki_rus_clean.txt -dim 100 -output data/model_rus_wiki
# Normalise the trained vectors with vecmap's "unit" and "center" actions
# (presumably length normalisation followed by mean centring - confirm).
python3 vecmap/normalize_embeddings.py unit center -i data/model_rus_wiki.vec -o data/norm/model_rus_100_norm.vec

# Download the Russian Wikipedia dump of the same date.
wget https://dumps.wikimedia.org/ruwiki/20170701/ruwiki-20170701-pages-articles-multistream.xml.bz2

  
## w2v_visualizer.py
# encoding: utf-8
"""
@author: BrikerMan
@contact: [email protected]
@blog: https://eliyar.biz
@version: 1.0
@license: Apache Licence
@file: w2v_visualizer.py
@time: 2017/7/30 上午9:37
@comment: Modified by Niko Partanen on 30.11.2017

## cyrillic2latin_file_renamer.py
#!/usr/bin/python
# -*- coding: utf-8 -*-

# http://stackoverflow.com/questions/5574702/how-to-print-to-stderr-in-python
from __future__ import print_function
import sys
def eprint(*args, **kwargs):
    """Print to standard error instead of standard output.

    Takes the same positional and keyword arguments as ``print`` and forwards
    them unchanged, with ``file`` set to ``sys.stderr``.
    """
    stream = sys.stderr
    print(*args, file=stream, **kwargs)

import errno

## lingtypology-example.md

      
              1 file
            
          
              0 forks
            
          
                1 comment
              
            
              1 star
            
          
                nikopartanen
                / lingtypology-example.md
            
            
              Last active
              March 10, 2018 09:30
            
          
    # Attach the overpass and sf packages.
    library(overpass)
library(sf)

# Overpass QL query text. It selects areas whose name matches one of five
# region names written in Russian (Adygea, Kabardino-Balkaria,
# Karachay-Cherkessia, Stavropol Krai, Krasnodar Krai), then the nodes inside
# those areas whose "place" tag matches city, village, town, hamlet or
# isolated_dwelling, and outputs them.
settlements <- 'area[name~"Адыгея|Кабардино-Балкария|Карачаево-Черкесия|Ставропольский край|Краснодарский край"];
(node["place"~"city|village|town|hamlet|isolated_dwelling"](area););
out;'

# NOTE(review): presumably sends the query to an Overpass API server and
# returns the matching nodes - confirm against the overpass package docs.
query_result <- overpass_query(settlements)


## split_test_train.R
### This is Niko Partanen's example R script that splits the National
### Library of Finland's dataset OCR Ground Truth Pages (Swedish Fraktur)
### into pairs of line images and text files that can be used for training
### models with Tesseract. The same approach also works easily with Ocropy.
### Data source:
### https://digi.kansalliskirjasto.fi/opendata

library(tidyverse)
library(xml2)
library(measurements)

## NeedlemanWunsch.png

      
              3 files
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                nikopartanen
                / NeedlemanWunsch.png
            
            
              Created
              May 20, 2019 10:47
                — forked from slowkow/NeedlemanWunsch.png
            
              
                Plot the score matrix from a Needleman-Wunsch pairwise sequence alignment
              
          
## compile_language.sh
# Clean, configure, build and install one language directory under
# /mnt/data/trunk/langs.
# Usage: compile_language.sh LANG   (LANG is the directory name under langs/)
lang=$1

# Without an argument the script would cd into the langs/ root and run the
# build there, so stop early. `return` covers the case where the file is
# sourced, `exit` the case where it is executed.
if [ -z "$lang" ]; then
    echo "usage: compile_language.sh LANG" >&2
    return 1 2>/dev/null || exit 1
fi

lang_dir=/mnt/data/trunk/langs/$lang

# NOTE(review): GTLANG_<lang> is presumably read by the build tooling - confirm.
export "GTLANG_$lang=$lang_dir"

# Never run the build steps from the wrong directory.
cd "$lang_dir" || {
    echo "cannot enter $lang_dir" >&2
    return 1 2>/dev/null || exit 1
}

# Best effort: this fails harmlessly when no Makefile has been generated yet.
make clean

# Stop at the first failing step instead of installing a broken build.
./autogen.sh &&
    ./configure --with-hfst --without-xfst --enable-tokenisers --enable-reversed-intersect --enable-alignment --enable-apertium --enable-dicts --enable-morpher &&
    make &&
    make install

## transkribus-xml-edit-example.py
from pathlib import Path
import xml.etree.cElementTree as ET
import re

# Information about superscripts and subscripts is marked in elements like this:
# textStyle {offset:13; length:1;superscript:true;}
# We need to pick out where they occur and how long they are

def get_offset_info(offsets):


## alto2lines.py
import xml.etree.cElementTree as ET

from PIL import Image
import numpy as np
from google.protobuf.json_format import MessageToJson

import matplotlib.pyplot as plt
import matplotlib.patches as patches
from pathlib import Path
import sklearn as sk

## lingtypology_komi_example.R
# Attach the tidyverse (for read_csv) and lingtypology (for map.feature).
library(tidyverse)
library(lingtypology)

# Read the table straight from the langdoc/kpv-geography GitHub repository.
kpv <- read_csv("https://raw.githubusercontent.com/langdoc/kpv-geography/master/kpv.csv")

# Draw the map from columns of kpv: language names, dialect as the mapped
# feature, settlement as the label, and the latitude/longitude coordinates.
map.feature(languages = kpv$language,
            features = kpv$dialect,
            label = kpv$settlement,
            latitude = kpv$latitude,
            longitude = kpv$longitude)
	# encoding: utf-8
	"""
	@author: BrikerMan
	@contact: [email protected]
	@blog: https://eliyar.biz
	@version: 1.0
	@license: Apache Licence
	@file: w2v_visualizer.py
	@time: 2017/7/30 上午9:37
	@comment: Modified by Niko Partanen in 30.11.2017
	#!/usr/bin/python
	# -*- coding: utf-8 -*-

	# http://stackoverflow.com/questions/5574702/how-to-print-to-stderr-in-python
	from __future__ import print_function
	import sys
	def eprint(*args, **kwargs):
		"""Print to standard error instead of standard output.

		Takes the same positional and keyword arguments as ``print`` and
		forwards them unchanged, with ``file`` set to ``sys.stderr``.
		"""
		# The star-unpacking matters: without it the arguments would be
		# printed as a single tuple and keyword options such as ``sep`` or
		# ``end`` would be passed as positional values.
		print(*args, file=sys.stderr, **kwargs)

	import errno
	### This is Niko Partanen's example R script that splits the National
	### Library of Finland's dataset OCR Ground Truth Pages (Swedish Fraktur)
	### into pairs of line images and text files that can be used for training
	### models with Tesseract. The same approach also works easily with Ocropy.
	### Data source:
	### https://digi.kansalliskirjasto.fi/opendata

	library(tidyverse)
	library(xml2)
	library(measurements)
	lang=$1

	export GTLANG_$lang=/mnt/data/trunk/langs/$lang

	cd /mnt/data/trunk/langs/$lang
	make clean
	./autogen.sh
	./configure --with-hfst --without-xfst --enable-tokenisers --enable-reversed-intersect --enable-alignment --enable-apertium --enable-dicts --enable-morpher
	make
	make install
	from pathlib import Path
	import xml.etree.cElementTree as ET
	import re

	# Information about superscripts and subscripts is marked in elements like this:
	# textStyle {offset:13; length:1;superscript:true;}
	# We need to pick out where they occur and how long they are

	def get_offset_info(offsets):
	import xml.etree.cElementTree as ET

	from PIL import Image
	import numpy as np
	from google.protobuf.json_format import MessageToJson

	import matplotlib.pyplot as plt
	import matplotlib.patches as patches
	from pathlib import Path
	import sklearn as sk
	library(tidyverse)
	library(lingtypology)

	kpv <- read_csv("https://raw.githubusercontent.com/langdoc/kpv-geography/master/kpv.csv")

	map.feature(languages = kpv$language,
	features = kpv$dialect,
	label = kpv$settlement,
	latitude = kpv$latitude,
	longitude = kpv$longitude)