#!/usr/bin/python
# process.py
# this script estimates the global prevalence of circumcision
# (C) Jake Waskett, 2011
import sys
import re
from string import Template
def parseSexRatio():
    """Extract male:female sex ratios from the CIA World Factbook HTML page.

    Reads ``sexratio.html`` from the current directory, as originally
    downloaded from:
    https://www.cia.gov/library/publications/the-world-factbook/fields/2018.html

    Returns a dictionary mapping country name to its "total population"
    sex ratio as a float. Raises ValueError if a matched ratio is not a
    number.
    """
    country_re = re.compile(r'style="font-weight: bold;">([^<]*)')
    ratio_re = re.compile(r'total population: ([^ ]*) ')
    tab = {}
    # no country seen yet; a ratio found before the first heading is ignored
    # instead of failing on an unbound name
    country = None
    # the with-block closes the file even if float() raises below
    with open("sexratio.html") as fh:
        for line in fh:
            # a bold heading names the country the next ratio belongs to
            m = country_re.search(line)
            if m:
                country = m.group(1)
            # parse M:F ratio
            m = ratio_re.search(line)
            if m and country is not None:
                tab[country] = float(m.group(1))
    return tab
def parsePop():
    """Extract population figures from the CIA World Factbook HTML page.

    Reads ``pop.html`` from the current directory, as originally
    downloaded from:
    https://www.cia.gov/library/publications/the-world-factbook/rankorder/2119rank.html

    Returns a dictionary mapping country name to population as a float.
    """
    # NOTE(review): both patterns look truncated. The country pattern has no
    # markup anchoring its capture group (so it matches on every line), and
    # the population pattern was split across two physical lines inside one
    # quoted string, which is a syntax error. They are kept as found, with
    # the line break written as \n; confirm against the real pop.html markup.
    country_re = re.compile(r'([^<]*)')
    pop_re = re.compile(r'align="center" width="160"> *([0-9,]*)\n')
    tab = {}
    country = None
    # the with-block closes the file even if float() raises below
    with open("pop.html") as fh:
        for line in fh:
            # parse country
            m = country_re.search(line)
            if m:
                country = m.group(1)
            # parse population; thousands separators are removed first
            m = pop_re.search(line)
            if m and country is not None:
                digits = m.group(1).replace(',', '')
                # the pattern allows an empty capture; float('') would raise
                if digits:
                    tab[country] = float(digits)
    return tab
def parsePercentFile(filename):
    """Parse a CSV file of per-country percentages (muslim.csv or jewish.csv).

    Only lines that begin with ``"country",number`` are used; any other
    line (for example a quoted header row) is skipped.

    filename -- path of the CSV file to read.

    Returns a dictionary mapping country name to the percentage as a float.
    """
    row_re = re.compile(r'"([^"]*)",([0-9.]+)')
    tab = {}
    # the with-block closes the file even if float() raises below
    with open(filename) as fh:
        for line in fh:
            m = row_re.match(line)
            if m:
                tab[m.group(1)] = float(m.group(2))
    return tab
def parseCircdata():
    """Parse circdata.csv into per-country study lists and averages.

    Reads ``circdata.csv`` from the current directory. Each useful line has
    the form ``"country",pc,"ref"``; lines that do not match are skipped.

    Returns a dictionary mapping country to a tuple ``(avg, studyarray)``,
    where ``studyarray`` is a list of ``(ref, percent)`` tuples in file
    order and ``avg`` is the unweighted mean of those percentages.
    """
    row_re = re.compile(r'"([^"]*)",([0-9.]+),"([^"]*)"')
    studies = {}
    # the with-block closes the file even if float() raises below
    with open("circdata.csv") as fh:
        for line in fh:
            m = row_re.match(line)
            if m:
                country, pc, ref = m.groups()
                studies.setdefault(country, []).append((ref, float(pc)))
    # compute per-country averages; every list has at least one entry, so
    # the division cannot be by zero
    tab = {}
    for country, rows in studies.items():
        total = 0.0
        for _ref, pc in rows:
            total += pc
        tab[country] = (total / len(rows), rows)
    return tab
def genCircdataTable(tab):
    """Format the circdata data structure as an HTML table.

    tab -- dictionary mapping country to ``(avg, studyarray)``, where
           ``studyarray`` is a sequence of ``(ref, percent)`` tuples.

    Returns the table as a string with one row per country, sorted by
    country name so the output is reproducible. Each study is written as
    ``percent%@ref@``; the ``@ref@`` markers are left in place for a later
    reference-numbering pass.
    """
    # NOTE(review): the tags in these literals were missing from the file
    # (the strings were split across lines with the markup stripped). Plain
    # table/tr/th/td markup is assumed -- confirm against template.html.
    parts = ['<table>\n<tr><th>Country</th><th>% circumcised</th>'
             '<th>Source(s)</th></tr>\n']
    for country in sorted(tab):
        avg, studies = tab[country]
        sources = ', '.join('%.1f%%@%s@' % (pc, ref) for ref, pc in studies)
        parts.append('<tr><td>%s</td><td>%.1f</td><td>%s</td></tr>\n'
                     % (country, avg, sources))
    parts.append('</table>\n')
    return ''.join(parts)
def getMalePop():
    """Derive male population figures from the sex-ratio and population data.

    Calls parseSexRatio() and parsePop() and returns the population
    dictionary with each value replaced by a tuple
    ``(totalpop, sexratio, malepop, substflag)``. ``substflag`` is ``'a'``
    when the sex ratio was not available and 1.01 was substituted,
    otherwise the empty string.
    """
    ratios = parseSexRatio()
    populations = parsePop()
    for name, total in list(populations.items()):
        if name in ratios:
            ratio, flag = ratios[name], ''
        else:
            # sex ratio unknown; substitute 1.01 (world average)
            ratio, flag = 1.01, 'a'
        # with ratio k = m/f and m + f = p: f = p/(k+1), then m = k*f
        females = total / (ratio + 1)
        males = round(females * ratio)
        populations[name] = (total, ratio, males, flag)
    return populations
def combineSources(circdata):
    """Combine male population, Muslim, Jewish and circumcision data.

    circdata -- dictionary mapping country to a tuple whose first element
                is the circumcision percentage for that country (as
                returned by parseCircdata()).

    Returns ``(mpop, excluded_list)``:

    mpop          -- dictionary mapping country to the tuple
                     ``(totalpop, sexratio, malepop, substflags, mpc, jpc,
                     cpc, cmen)``, where mpc/jpc/cpc are the Muslim, Jewish
                     and circumcised percentages and cmen is the rounded
                     number of circumcised males.
    excluded_list -- list of ``(country, malepop)`` for countries that
                     appear in none of the three percentage sources.

    ``substflags`` starts from the flag set by getMalePop() and gains
    'b' when no Muslim figure was found (0 used), 'c' when no Jewish figure
    was found (0 used) and 'd' when no circumcision figure was found (the
    sum of the Muslim and Jewish percentages is used instead).
    """
    mpop = getMalePop()
    muslim = parsePercentFile('muslim.csv')
    jewish = parsePercentFile('jewish.csv')
    excluded_list = []
    # iterate over a snapshot of the keys: entries are deleted inside the
    # loop, which is an error when iterating a live dictionary view
    for country in list(mpop):
        totalpop, sexratio, malepop, substflags = mpop[country]
        has_muslim = country in muslim
        has_jewish = country in jewish
        has_circ = country in circdata
        # check we've got some sensible data to use
        if not (has_muslim or has_jewish or has_circ):
            excluded_list.append((country, malepop))
            del mpop[country]
            continue
        # extract % muslim
        if has_muslim:
            mpc = muslim[country]
        else:
            mpc = 0
            substflags += 'b'
        # extract % jewish
        if has_jewish:
            jpc = jewish[country]
        else:
            jpc = 0
            substflags += 'c'
        # extract % circumcised
        if has_circ:
            cpc = circdata[country][0]
        else:
            # substitute sum of muslim & jewish
            cpc = mpc + jpc
            substflags += 'd'
        # calculate number of circumcised men
        cmen = round(malepop * cpc / 100.0)
        mpop[country] = (totalpop, sexratio, malepop, substflags,
                         mpc, jpc, cpc, cmen)
    return (mpop, excluded_list)
def genMainTable(tab):
    """Format the combined data sources as an HTML table.

    tab -- dictionary mapping country to a tuple indexed as
           [2] total males, [3] notes/flags, [4] Muslim %, [5] Jewish %,
           [6] circumcised %, [7] total circumcised males.

    Returns the table as a string with one row per country, sorted by
    country name so the output is reproducible.
    """
    # NOTE(review): the tags in these literals were missing from the file
    # (the strings were split across lines with the markup stripped). Plain
    # table/tr/th/td markup is assumed -- confirm against template.html.
    header = ('<table>\n<tr><th>Country</th><th>Muslim %</th>'
              '<th>Jewish %</th><th>Circumcised %</th><th>Total males</th>'
              '<th>Total circ\'d males</th><th>Notes</th></tr>\n')
    row_fmt = ('<tr><td>%s</td><td>%.1f</td><td>%.1f</td><td>%.1f</td>'
               '<td>%.0f</td><td>%.0f</td><td>%s</td></tr>\n')
    parts = [header]
    for country in sorted(tab):
        rec = tab[country]
        parts.append(row_fmt % (country, rec[4], rec[5], rec[6],
                                rec[2], rec[7], rec[3]))
    parts.append('</table>\n')
    return ''.join(parts)
def genExcludedList(tab):
    """Format the excluded country list as an HTML table.

    tab -- sequence of ``(country, total_males)`` entries.

    Returns the table as a string with one row per entry, in the order
    given.
    """
    # NOTE(review): the tags in these literals were missing from the file
    # (the strings were split across lines with the markup stripped). Plain
    # table/tr/th/td markup is assumed -- confirm against template.html.
    parts = ['<table>\n<tr><th>Country</th><th>Total males</th></tr>\n']
    for entry in tab:
        parts.append('<tr><td>%s</td><td>%.0f</td></tr>\n'
                     % (entry[0], entry[1]))
    parts.append('</table>\n')
    return ''.join(parts)
def calcTotals(tab):
    """Sum circumcised males and all males over every country.

    tab -- dictionary mapping country to a tuple in which index 2 is the
           male population and index 7 the number of circumcised males.

    Returns ``(cmen_total, mpop_total, percent)`` where ``percent`` is
    ``100 * cmen_total / mpop_total``, or 0.0 when the male total is zero
    (for example when ``tab`` is empty) so no ZeroDivisionError is raised.
    """
    cmen_total = 0.0
    mpop_total = 0.0
    for rec in tab.values():
        cmen_total += rec[7]
        mpop_total += rec[2]
    if mpop_total:
        percent = 100 * (cmen_total / mpop_total)
    else:
        # nothing to divide by; report zero prevalence rather than crash
        percent = 0.0
    return (cmen_total, mpop_total, percent)
def processRefs(src):
    """Number the ``@key@`` citation markers in src and append the reference list.

    Each ``@key@`` marker is replaced by a bracketed number (Vancouver
    style): keys are numbered from 1 in order of first appearance and a
    repeated key reuses its number. The reference texts are read from
    ``refs.csv`` in the current directory, whose useful lines have the form
    ``"key","text"``; a doubled single quote (``''``) in the text stands
    for a double quote. Keys with no entry in refs.csv are listed as
    ``MISSING REF key``.

    src -- the text to process.

    Returns the processed text followed by the numbered reference list.
    """
    ref_to_num = {}
    ref_list = []

    def number_ref(m):
        # assign the next number on first sight of a key, reuse it afterwards
        ref = m.group(1)
        if ref not in ref_to_num:
            ref_list.append(ref)
            ref_to_num[ref] = len(ref_list)
        refnum = ref_to_num[ref]
        # NOTE(review): the markup here was missing from the file; the format
        # string had one %d for two arguments. A link to the list entry is
        # assumed -- confirm.
        return '<a href="#ref%d">[%d]</a>' % (refnum, refnum)

    # one left-to-right pass instead of repeatedly re-slicing the input
    out = re.sub(r'@(.*?)@', number_ref, src)

    # parse refs.csv file
    reftab = {}
    row_re = re.compile(r'"([^"]*)","([^"]*)"')
    with open("refs.csv") as fh:
        for line in fh:
            m = row_re.match(line)
            if m:
                reftab[m.group(1)] = m.group(2).replace("''", '"')

    # format reference list
    # NOTE(review): list markup reconstructed (ordered list, one item per
    # reference, ids matching the links above) -- confirm.
    parts = [out, '<ol>\n']
    for refnum, ref in enumerate(ref_list, 1):
        if ref in reftab:
            parts.append('<li id="ref%d">%s</li>\n' % (refnum, reftab[ref]))
        else:
            parts.append('<li id="ref%d">MISSING REF %s</li>\n' % (refnum, ref))
    parts.append('</ol>\n')
    return ''.join(parts)
def genCompareCsv(tab):
    """Write compare.csv: predicted versus recorded circumcision rates.

    A country is written only when its flags (index 3) do not contain 'd',
    i.e. when its circumcised percentage was not substituted from the
    religious figures. "Predicted" is the sum of the Muslim and Jewish
    percentages (indices 4 and 5), "Actual" is index 6 and "MPop" is the
    male population (index 2). Rows are sorted by country name so the file
    is reproducible.

    tab -- dictionary mapping country to the combined data tuple.

    Writes ``compare.csv`` in the current directory; returns nothing.
    """
    # the with-block flushes and closes the file even if formatting fails
    with open("compare.csv", "w") as fh:
        fh.write('"Country","Predicted","Actual","MPop"\n')
        for country in sorted(tab):
            rec = tab[country]
            if 'd' not in rec[3]:
                fh.write('"%s",%f,%f,%f\n'
                         % (country, rec[4] + rec[5], rec[6], rec[2]))
# main program
# Builds every table and total, substitutes them into template.html, prints
# the finished page (with references numbered) to standard output, then
# writes compare.csv.
circdata = parseCircdata()
(combined_data, excluded_list) = combineSources(circdata)
# compute the totals once and reuse all three values
totals = calcTotals(combined_data)
mapping = {}
mapping['combined'] = genMainTable(combined_data)
mapping['circdata'] = genCircdataTable(circdata)
mapping['circtot'] = "%.0f" % totals[0]
mapping['maletot'] = "%.0f" % totals[1]
mapping['globalcirc'] = "%.1f" % totals[2]
mapping['excltotal'] = len(excluded_list)
mapping['excllist'] = genExcludedList(excluded_list)
# the with-block closes the template even if reading it fails
with open('template.html') as template_fh:
    template = Template(template_fh.read())
# single-argument call form: valid as both a Python 2 print statement and a
# Python 3 print() call
print(processRefs(template.substitute(mapping)))
genCompareCsv(combined_data)