#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys

#Prototype / proof-of-concept for query language detection / usefulness


#Weights:
#  0.0   = useless. Use this _very_ sparingly
#  1.0   = normal/average/neutral
#  10.0  = perfect(ish)



# ISO 639-1 codes of the languages this prototype can score.
language_codes = [
	"cs",	#czech (NOTE(review): was labelled "slovak" -- cs is Czech; sk below is Slovak)
	"da",	#danish
	"de",	#german
	"el",	#greek
	"en",	#english
	"es",	#spanish
	"fi",	#finnish
	"fo",	#faeroese
	"fr",	#french
	"hu",	#hungarian
	"is",	#icelandic
	"it",	#italian
	"nl",	#dutch
	"no",	#norwegian
	"pl",	#polish
	"pt",	#portuguese
	"ro",	#romanian
	"sk",	#slovak
	"sv"	#swedish
]

# ISO 3166-1 alpha-2 codes of the countries the prototype knows about,
# loosely grouped by region (one row per group).
country_codes =  [
	"dk", "no", "se", "is", "fo",	#Nordic countries
	"fi",	#Finland
	"nl", "de", "be",	#Germany and the Low Countries
	"fr", "pt", "es", "it", "ro", "mc", "sm", "va", "ad",	#Romance-speaking Europe
	"ch", "at", "li",	#Alpine countries
	"uk", "ie", "us",	#English-speaking
	"ca",	#Canada
	"hu",	#Hungary
	"gr",	#Greece
	"cy"	#Cyprus
]
	


#Language mutual intelligibility
#	(ulang,dlang) -> weight
#	presumably (user's language, document/candidate language); the lookup
#	below uses (browser-language, candidate-language) -- TODO confirm naming.
language_intelligibility = {}



#country language understanding
#  (cc,lang) -> weight; how likely someone in country cc understands lang
country_language_weight = {}



#TLD/domain to language weights
#	tld -> [weight]
#tld_language_weight
#NOT implemented in this prototype. The only interesting case right now is .eu, but that just boosts all european languages.


#dictionary of word-to-languages
#	word -> [lang...]
dictionary = {}




#letter to keyboard/language/country weight
# char -> {lang:weight}
char_language_weight = {}


#browser-language boosts, depending on order. We can't use the accept-lang "q=" parts because the users cannot choose those themselves.
#elements must be in range [1…10]; index = position of the language in the browser's Accept-Language list (earlier entries boost more).
blang_count_weights = [10.0, 7.0, 4.0, 3.0, 2.0, 1.5, 1.2, 1.0]



######################################################################
# Preload data

# Seed language-to-language intelligibility: every pair defaults to the
# neutral weight 1.0, and every language is perfectly intelligible (10.0)
# to itself.
for src in language_codes:
	for dst in language_codes:
		language_intelligibility[(src, dst)] = 10.0 if src == dst else 1.0
# Hand-tuned overrides for partially intelligible language pairs.
language_intelligibility.update({
	#continental north common-Germanic, which is a complicated way of saying "north-Germanic languages in Fennoscandia and Denmark" :-)
	("da", "no"): 7.0,
	("da", "sv"): 5.0,
	("no", "da"): 5.0,
	("no", "sv"): 7.0,
	("sv", "da"): 5.0,
	("sv", "no"): 7.0,
	#romance language family intelligibility is asymmetric
	("es", "it"): 2.0,
	("es", "pt"): 3.0,
	("pt", "es"): 2.0,
	("it", "es"): 2.0,
	#romanian is left out because my Romanian coworker didn't understand any Italian, so it has probably been isolated for too long to be intelligible with other romance languages.
	#czech/slovak is somewhat intelligible, esp. for the older generations
	("cs", "sk"): 2.0,
	("sk", "cs"): 2.0,
})
# Load data for country/language: default every (country, language) pair to
# the neutral weight 1.0, then override with specific knowledge below.
# NOTE: entries whose language code is not in language_codes (e.g. "la",
# "tr", "ru") are currently never read, because the scoring loop below only
# iterates over language_codes.
for cc in country_codes:
	for lc in language_codes:
		country_language_weight[(cc,lc)] = 1.0
#monolingual countries
country_language_weight[("dk","da")] = 10.0
country_language_weight[("no","no")] = 10.0
country_language_weight[("se","sv")] = 10.0
country_language_weight[("is","is")] = 10.0
country_language_weight[("fo","fo")] = 10.0
country_language_weight[("de","de")] = 10.0
country_language_weight[("at","de")] = 10.0
country_language_weight[("nl","nl")] = 10.0
country_language_weight[("fr","fr")] = 10.0
country_language_weight[("es","es")] = 10.0
country_language_weight[("pt","pt")] = 10.0
country_language_weight[("it","it")] = 10.0
country_language_weight[("ro","ro")] = 10.0
country_language_weight[("uk","en")] = 10.0
country_language_weight[("ie","en")] = 10.0
country_language_weight[("gr","el")] = 10.0
country_language_weight[("us","en")] = 10.0 #mostly, ignoring the largish spanish-speaking immigrant population
country_language_weight[("li","de")] = 10.0
country_language_weight[("mc","fr")] = 10.0
country_language_weight[("sm","it")] = 10.0
country_language_weight[("cy","el")] = 10.0
#multilingual countries
country_language_weight[("be","nl")] = 7.0
country_language_weight[("be","fr")] = 7.0
country_language_weight[("ch","de")] = 7.0
country_language_weight[("ch","fr")] = 7.0
country_language_weight[("fi","fi")] = 7.0
country_language_weight[("fi","sv")] = 7.0
country_language_weight[("ca","en")] = 7.0
country_language_weight[("ca","fr")] = 7.0
country_language_weight[("va","la")] = 7.0	#officially Latin (fixed: was "lt", which is Lithuanian)
country_language_weight[("va","it")] = 5.0	#probably
country_language_weight[("va","fr")] = 3.0	#swiss garde
country_language_weight[("ad","es")] = 9.0
country_language_weight[("ad","fr")] = 7.0
#Taught 1st and 2nd foreign languages, and minority languages (eg italian in switzerland or portuguese in luxembourg)
country_language_weight[("dk","en")] = 7.0
country_language_weight[("dk","de")] = 4.0
country_language_weight[("no","en")] = 7.0
country_language_weight[("no","de")] = 4.0
country_language_weight[("se","en")] = 7.0
country_language_weight[("se","de")] = 4.0
country_language_weight[("is","da")] = 5.0
country_language_weight[("fo","da")] = 6.0
country_language_weight[("de","en")] = 5.0
country_language_weight[("de","fr")] = 2.0
country_language_weight[("de","la")] = 1.5
country_language_weight[("nl","en")] = 7.0
country_language_weight[("nl","de")] = 6.0
country_language_weight[("pt","en")] = 3.0
country_language_weight[("fr","en")] = 3.0
country_language_weight[("it","en")] = 2.0
country_language_weight[("ch","it")] = 1.5
country_language_weight[("at","en")] = 7.0
country_language_weight[("at","fr")] = 3.0
country_language_weight[("at","it")] = 3.0
country_language_weight[("gr","en")] = 4.0	#source: "Foreign Language Learning In Greek Schools"
country_language_weight[("gr","fr")] = 3.0	#ditto
country_language_weight[("gr","de")] = 3.0	#ditto
country_language_weight[("gr","it")] = 2.0	#ditto
country_language_weight[("gr","es")] = 2.0	#ditto
country_language_weight[("gr","tr")] = 1.5	#ditto
country_language_weight[("gr","ru")] = 1.5	#ditto
country_language_weight[("us","es")] = 1.5	#there is a large Spanish-speaking minority in the US.


#Very selective dictionary just to validate concept
#	word -> list of language codes in which the word occurs
dictionary["plankebøf"] = ["da"]
dictionary["plankebiff"] = ["no"]
dictionary["plankstek"] = ["sv"]
dictionary["ålegilde"] = ["da","no"]
dictionary["restaurant"] = ["da","no","sv","de","en","nl","is","fo"]
dictionary["ristorante"] = ["it"]
dictionary["restaurante"] = ["es","pt"]
dictionary["ragù"] = ["it"]
dictionary["gondola"] = ["it"]
dictionary["currywurst"] = ["de"]
dictionary["oktoberfest"] = ["de"]
dictionary["zimmer"] = ["de"]
dictionary["cheval"] = ["fr"]
dictionary["infant"] = ["fr"]	#NOTE(review): French for child is "enfant" -- confirm this entry is intended
dictionary["junge"] = ["de"]
dictionary["pålegg"] = ["no"]
dictionary["yksi"] = ["fi"]
dictionary["sleeping"] = ["en"]
dictionary["ελληνικά"] = ["el"]



# Load char_language
# weight<1 = less likely, not used in native words;  weight>1 = letter somewhat unique to language
# Only languages whose weight differs from the neutral 1.0 are listed per
# character; the column header below shows the alignment order.
#                             cs	da	de	en	es	fi	fo	fr	hu	is	it	nl	no	pl	pt	ro	sk	sv
char_language_weight['c'] = {										"is":0.9								}
char_language_weight['j'] = {											"it":0.7							}
char_language_weight['í'] = {"cs":2.0,						"fo":2.0,		"is":2.0,						"sk":2.0	}
char_language_weight['k'] = {					"es":0.9,					"it":0.7							}
char_language_weight['ó'] = {										"is":2.0								}
char_language_weight['q'] = {						"fi":0.9,							"pl":0.9				}
char_language_weight['v'] = {														"pl":0.9				}
char_language_weight['w'] = {		"da":0.8,			"fi":0.7,				"it":0.7,	"no":0.7,				"sv":0.8}
char_language_weight['y'] = {											"it":0.7							}
char_language_weight['æ'] = {		"da":2.0,				"fo":2.0,		"is":2.0,		"no":2.0					}
char_language_weight['ø'] = {		"da":2.0,				"fo":2.0,					"no":2.0					}
char_language_weight['å'] = {		"da":2.0,										"no":2.0,				"sv":2.0}
char_language_weight['ä'] = {			"de":2.0,		"fi":2.0,										"sk":2.0,"sv":2.0}
char_language_weight['ð'] = {							"fo":2.0,		"is":2.0								}
char_language_weight['þ'] = {										"is":2.0								}
char_language_weight['ö'] = {			"de":2.0,					"hu":2.0,"is":2.0,							"sv":2.0}
char_language_weight['ü'] = {			"de":2.0,					"hu":2.0,									}
char_language_weight['ñ'] = {					"es":2.0,									"pt":2.0			}
char_language_weight['õ'] = {															"pt":2.0			}
char_language_weight['ã'] = {															"pt":2.0			}
char_language_weight['ù'] = {											"it":2.0							}
char_language_weight['ç'] = {					"es":2.0,		"fr":2.0,						"pt":2.0			}
char_language_weight['ú'] = {															"pt":2.0			}
char_language_weight['ą'] = {														"pl":2.0				}
char_language_weight['ł'] = {														"pl":2.0				}
char_language_weight['ź'] = {														"pl":2.0				}
char_language_weight['ż'] = {														"pl":2.0				}
char_language_weight['ß'] = {			"de":2.0															}
char_language_weight['č'] = {"cs":2.0,																"sk":2.0	}
char_language_weight['ď'] = {"cs":2.0																		}
char_language_weight['ǳ'] = {"cs":2.0,																"sk":2.0	}
char_language_weight['ǆ'] = {"cs":2.0,																"sk":2.0	}
char_language_weight['ě'] = {"cs":2.0																		}
char_language_weight['ň'] = {"cs":2.0,																"sk":2.0	}
char_language_weight['ř'] = {"cs":2.0																		}
char_language_weight['š'] = {"cs":2.0,																"sk":2.0	}
char_language_weight['ť'] = {"cs":2.0,																"sk":2.0	}
char_language_weight['ý'] = {"cs":2.0,						"fo":2.0,									"sk":2.0	}
char_language_weight['ž'] = {"cs":2.0,																"sk":2.0	}
char_language_weight['ő'] = {									"hu":2.0									}
char_language_weight['ű'] = {									"hu":2.0									}
#boost greek letters for greek but don't penalize other languages because greek letters are frequently used for technical/scientific stuff
#NOTE(review): the uppercase range includes U+03A2, which is an unassigned
#codepoint; harmless since that character never occurs in real queries.
for codepoint in range(0x0391,0x03a9+1):
	char_language_weight[chr(codepoint)] = {"el":2.0}
for codepoint in range(0x03b1,0x03c9+1):
	char_language_weight[chr(codepoint)] = {"el":2.0}


# Require at least: program name, country code, browser-language list, one query word.
if len(sys.argv) < 4:
	prog = sys.argv[0]
	usage = [
		"Usage: %s <country-code> <browserlang-list> Query..." % prog,
		"Examples:",
		"	%s de de,en,fr currywurst" % prog,
		'	%s it "" ragù' % prog,
		"	%s us en What is the airspeed velocity of an unladen swallow?" % prog,
	]
	for line in usage:
		print(line, file=sys.stderr)
	sys.exit(1)

# Positional arguments: country code, comma-separated browser languages, query words.
cc, blangs = sys.argv[1], sys.argv[2]
query_words = sys.argv[3:]

# Echo the inputs for debugging.
print("cc=", cc)
print("blangs=", blangs)
print("query_words=", query_words)

def weights_str(weights):
	"""Return *weights* formatted as "{'lang':  w.w, ...}", sorted by key.

	Each weight is rendered with the fixed "%5.1f" width so the columns
	line up across the successive debug printouts below.
	"""
	# str.join replaces the original manual first-flag concatenation loop.
	parts = ("'%s':%5.1f" % (code, weight) for code, weight in sorted(weights.items()))
	return "{" + ", ".join(parts) + "}"

# Start every candidate language at the neutral weight.
language_weights = dict.fromkeys(language_codes, 1.0)

print("Initial weights:                       ", weights_str(language_weights))

# Scale each language by how well it is understood in the request's country.
if cc in country_codes:
	for code in language_codes:
		language_weights[code] *= country_language_weight[(cc, code)]
print("weights after country_language_weight: ", weights_str(language_weights))

# Boost languages listed in the browser's Accept-Language list, and languages
# mutually intelligible with them; earlier entries in the list count more.
# Fix: the original indexed blang_count_weights with an unclamped counter, so
# more than len(blang_count_weights) browser languages raised IndexError.
# Clamp to the last entry (1.0, neutral) instead.
for blang_idx, lc in enumerate(blangs.split(',')):
	if lc in language_codes:
		bcw = blang_count_weights[min(blang_idx, len(blang_count_weights) - 1)]
		for lc2 in language_codes:
			w = language_intelligibility[(lc, lc2)]
			# Scale the intelligibility boost by the browser-list position:
			# bcw=10 applies w unchanged, bcw=1 neutralizes it to 1.0.
			w2 = 1+((w-1) * ((bcw-1.0)/9.0))
			language_weights[lc2] = language_weights[lc2] * w2
print("weights after language_intelligibility:", weights_str(language_weights))


# Count, per language, how many query words appear in that language's
# dictionary, then boost languages proportionally to their share of hits.
if len(query_words) > 0:
	hits = {code: 0 for code in language_codes}
	for word in query_words:
		for code in dictionary.get(word, []):
			hits[code] += 1
	total_hits = sum(hits.values())
	if total_hits > 0:
		for code in language_codes:
			# Share of hits scaled into a boost in [1.0, 6.0].
			boost = 1.0 + float(hits[code]) / total_hits * 5
			language_weights[code] = language_weights[code] * boost
	print("weights after dictionary checks       :", weights_str(language_weights))

# Per query word, find the strongest penalty (lowest weight) and strongest
# boost (highest weight) any of its characters gives each language, and
# apply both to that language's score.
for word in query_words:
	lowest = dict.fromkeys(language_codes, 1.0)
	highest = dict.fromkeys(language_codes, 1.0)
	for ch in word:
		for code, weight in char_language_weight.get(ch, {}).items():
			if weight < lowest[code]:
				lowest[code] = weight
			if weight > highest[code]:
				highest[code] = weight
	for code in language_codes:
		# lowest only ever decreases and highest only increases from 1.0,
		# so they differ exactly when some character touched this language.
		if lowest[code] != highest[code]:
			language_weights[code] = language_weights[code] * lowest[code]
			language_weights[code] = language_weights[code] * highest[code]
print("weights after char-lang adjustments   :", weights_str(language_weights))
