#!/usr/bin/python
# process.py
# this script estimates the global prevalence of circumcision
# (C) Jake Waskett, 2011
#
# NOTE(review): this file was recovered from a copy in which whitespace had
# been collapsed and the HTML tags embedded in string literals had been
# stripped.  The markup in the table/list generators below is a minimal
# reconstruction (plain <table>/<tr>/<th>/<td>/<ol>/<li>/<a> tags chosen so
# that every format string has as many specifiers as arguments); confirm it
# against template.html and the original output before relying on it.

import sys
import re
from string import Template


def parseSexRatio():
    """Extract sex-ratio data from the CIA HTML file.

    Parses sexratio.html, as originally downloaded from:
    https://www.cia.gov/library/publications/the-world-factbook/fields/2018.html

    Returns a dictionary mapping country name to its male:female ratio
    (float).  Raises IOError/OSError if the file cannot be opened and
    ValueError if a captured ratio is not a valid number.
    """
    tab = {}
    country = None
    with open("sexratio.html") as fh:
        for line in fh:
            # parse country
            m = re.search(r'style="font-weight: bold;">([^<]*)', line)
            if m:
                country = m.group(1)
            # parse M:F ratio; ignore ratios seen before any country name
            m = re.search(r'total population: ([^ ]*) ', line)
            if m and country is not None:
                tab[country] = float(m.group(1))
    return tab


def parsePop():
    """Extract total-population data from the CIA HTML file.

    Parses pop.html, as originally downloaded from:
    https://www.cia.gov/library/publications/the-world-factbook/rankorder/2119rank.html

    Returns a dictionary mapping country name to total population (float).
    Raises IOError/OSError if the file cannot be opened and ValueError if a
    captured population figure is empty or not numeric.
    """
    tab = {}
    country = None
    with open("pop.html") as fh:
        for line in fh:
            # parse country
            # NOTE(review): this pattern appears to have lost a leading HTML
            # tag; as written it matches at the start of every line and
            # captures the text before the first '<'.  Restore the original
            # prefix if it is known.
            m = re.search('([^<]*)', line)
            if m:
                country = m.group(1)
            # parse population
            # NOTE(review): markup around the number was stripped from this
            # pattern too; the line breaks that replaced it are transcribed
            # as '\n'.  Verify against pop.html.
            m = re.search('align="center" width="160">\n*([0-9,]*)\n', line)
            if m and country is not None:
                pop = m.group(1).replace(',', '')
                tab[country] = float(pop)
    return tab


def parsePercentFile(filename):
    """Parse muslim.csv or jewish.csv.

    Each useful line has the form: "country",percent
    Lines that do not match are skipped.  Returns a dictionary mapping
    country name to percentage (float).
    """
    tab = {}
    with open(filename) as fh:
        for line in fh:
            m = re.match(r'"([^"]*)",([0-9.]+)', line)
            if m:
                tab[m.group(1)] = float(m.group(2))
    return tab


def parseCircdata():
    """Parse circdata.csv.

    File format: "country",pc,"ref"

    Returns a dictionary of tuples (avg, studyarray), keyed by country,
    where studyarray is a list of (study, percent) tuples and avg is the
    unweighted mean of the percentages in studyarray.
    """
    studies = {}
    with open("circdata.csv") as fh:
        for line in fh:
            m = re.match(r'"([^"]*)",([0-9.]+),"([^"]*)"', line)
            if m:
                entry = (m.group(3), float(m.group(2)))
                studies.setdefault(m.group(1), []).append(entry)

    # compute per-country averages; every list has at least one entry
    tab = {}
    for country, entries in studies.items():
        total = sum(pc for _ref, pc in entries)
        tab[country] = (total / len(entries), entries)
    return tab


def genCircdataTable(tab):
    """Format the circdata data structure as an HTML table.

    tab is the dictionary returned by parseCircdata().  Each source is
    emitted as 'percent%@ref@'; the @ref@ markers are later turned into
    numbered references by processRefs().
    """
    parts = ['<table>\n',
             '<tr><th>Country</th><th>% circumcised</th>'
             '<th>Source(s)</th></tr>\n']
    for country in tab:
        avg, entries = tab[country]
        parts.append('<tr><td>%s</td><td>%.1f</td><td>' % (country, avg))
        parts.append(', '.join('%.1f%%@%s@' % (pc, ref)
                               for ref, pc in entries))
        parts.append('</td></tr>\n')
    parts.append('</table>\n')
    return ''.join(parts)


def getMalePop():
    """Combine parseSexRatio and parsePop to get male population figures.

    Returns a dictionary, keyed by country, of tuples:
    (totalpop, sexratio, malepop, substflag).  substflag is 'a' when the
    sex ratio was unknown and the world average was substituted, otherwise
    the empty string.
    """
    sr = parseSexRatio()
    pop = parsePop()
    result = {}
    for country, pop_country in pop.items():
        if country in sr:
            sr_country = sr[country]
            substflag = ''
        else:
            # sex ratio unknown; substitute 1.01 (world average)
            sr_country = 1.01
            substflag = 'a'
        # if kf+f=p then f=p/(k+1)
        pop_f = pop_country / (sr_country + 1)
        # and m=kf
        pop_m = round(pop_f * sr_country)
        result[country] = (pop_country, sr_country, pop_m, substflag)
    return result


def combineSources(circdata):
    """Combine getMalePop with the Muslim, Jewish and circdata sources.

    circdata is the dictionary returned by parseCircdata().

    Returns a tuple (combined, excluded_list):
      combined      -- dictionary keyed by country of tuples
                       (totalpop, sexratio, malepop, substflags,
                        muslim_pc, jewish_pc, circ_pc, circ_men)
      excluded_list -- list of (country, malepop) for countries that appear
                       in none of the three data sources

    substflags accumulates: 'a' sex ratio substituted, 'b' no Muslim
    figure, 'c' no Jewish figure, 'd' circumcision rate estimated as the
    sum of the Muslim and Jewish percentages.
    """
    mpop = getMalePop()
    muslim = parsePercentFile('muslim.csv')
    jewish = parsePercentFile('jewish.csv')
    combined = {}
    excluded_list = []
    # Build a new dictionary instead of deleting from mpop during iteration,
    # which raises RuntimeError on Python 3.
    for country in mpop:
        totalpop, sexratio, malepop, substflags = mpop[country]

        # check we've got some sensible data to use
        if (country not in muslim and country not in jewish
                and country not in circdata):
            excluded_list.append((country, malepop))
            continue

        # extract % muslim
        if country in muslim:
            mpc = muslim[country]
        else:
            mpc = 0
            substflags += 'b'
        # extract % jewish
        if country in jewish:
            jpc = jewish[country]
        else:
            jpc = 0
            substflags += 'c'
        # extract % circumcised
        if country in circdata:
            cpc = circdata[country][0]
        else:
            # substitute sum of muslim & jewish
            cpc = mpc + jpc
            substflags += 'd'

        # calculate number of circumcised men
        cmen = round(malepop * cpc / 100.0)
        combined[country] = (totalpop, sexratio, malepop, substflags,
                             mpc, jpc, cpc, cmen)
    return (combined, excluded_list)


def genMainTable(tab):
    """Format the combined data sources as an HTML table.

    tab is the first element of the tuple returned by combineSources().
    """
    parts = ['<table>\n',
             '<tr><th>Country</th><th>Muslim %</th><th>Jewish %</th>'
             '<th>Circumcised %</th><th>Total males</th>'
             '<th>Total circ\'d males</th><th>Notes</th></tr>\n']
    row = ('<tr><td>%s</td><td>%.1f</td><td>%.1f</td><td>%.1f</td>'
           '<td>%.0f</td><td>%.0f</td><td>%s</td></tr>\n')
    for country in tab:
        rec = tab[country]
        parts.append(row % (country, rec[4], rec[5], rec[6],
                            rec[2], rec[7], rec[3]))
    parts.append('</table>\n')
    return ''.join(parts)


def genExcludedList(tab):
    """Format the excluded country list as an HTML table.

    tab is a sequence of (country, malepop) tuples.
    """
    parts = ['<table>\n',
             '<tr><th>Country</th><th>Total males</th></tr>\n']
    for country, malepop in tab:
        parts.append('<tr><td>%s</td><td>%.0f</td></tr>\n'
                     % (country, malepop))
    parts.append('</table>\n')
    return ''.join(parts)


def calcTotals(tab):
    """Calculate totals over the combined data.

    Returns (circumcised_men_total, male_pop_total, percent_circumcised).
    The percentage is 0.0 when the male population total is zero (e.g. an
    empty table) rather than raising ZeroDivisionError.
    """
    cmen_total = 0.0
    mpop_total = 0.0
    for rec in tab.values():
        cmen_total += rec[7]
        mpop_total += rec[2]
    if mpop_total:
        percent = 100 * (cmen_total / mpop_total)
    else:
        percent = 0.0
    return (cmen_total, mpop_total, percent)


def processRefs(src):
    """Replace @ref@ markers in src with numbered references.

    Each distinct reference key is numbered in order of first appearance
    (Vancouver style) and every marker is replaced by its number.  A
    numbered reference list, with the text for each key looked up in
    refs.csv (format: "key","text", with '' standing for a double quote),
    is appended to the result.  Keys missing from refs.csv are listed as
    'MISSING REF key'.
    """
    ref_to_num = {}
    ref_list = []

    def _number(match):
        # assign the next number on first sight of a reference key
        ref = match.group(1)
        if ref not in ref_to_num:
            ref_list.append(ref)
            ref_to_num[ref] = len(ref_list)
        refnum = ref_to_num[ref]
        # format Vancouver-style ref number
        return '<a href="#ref%d">[%d]</a>' % (refnum, refnum)

    out = re.sub('@(.*?)@', _number, src)

    # parse refs.csv file
    reftab = {}
    with open("refs.csv") as fh:
        for line in fh:
            m = re.match(r'"([^"]*)","([^"]*)"', line)
            if m:
                reftab[m.group(1)] = m.group(2).replace("''", '"')

    # format reference list
    parts = [out, '<ol>\n']
    for index, ref in enumerate(ref_list):
        if ref in reftab:
            parts.append('<li id="ref%d">%s</li>\n'
                         % (index + 1, reftab[ref]))
        else:
            parts.append('<li>MISSING REF %s</li>\n' % ref)
    parts.append('</ol>\n')
    return ''.join(parts)


def genCompareCsv(tab):
    """Write compare.csv.

    The file lists the countries that have both a religiously-predicted
    circumcision rate and a recorded circumcision rate, i.e. those whose
    substitution flags do not contain 'd'.
    """
    with open("compare.csv", "w") as fh:
        fh.write('"Country","Predicted","Actual","MPop"\n')
        for country in tab:
            rec = tab[country]
            if 'd' not in rec[3]:
                fh.write('"%s",%f,%f,%f\n'
                         % (country, rec[4] + rec[5], rec[6], rec[2]))


def main():
    """Fill template.html with the generated tables and print the result."""
    circdata = parseCircdata()
    (combined_data, excluded_list) = combineSources(circdata)
    # compute the totals once rather than once per field
    (cmen_total, mpop_total, global_pc) = calcTotals(combined_data)

    mapping = {}
    mapping['combined'] = genMainTable(combined_data)
    mapping['circdata'] = genCircdataTable(circdata)
    mapping['circtot'] = "%.0f" % cmen_total
    mapping['maletot'] = "%.0f" % mpop_total
    mapping['globalcirc'] = "%.1f" % global_pc
    mapping['excltotal'] = len(excluded_list)
    mapping['excllist'] = genExcludedList(excluded_list)

    with open('template.html') as template_fh:
        template = Template(template_fh.read())
    # single-argument print(...) behaves the same on Python 2 and 3
    print(processRefs(template.substitute(mapping)))

    genCompareCsv(combined_data)


# main program
if __name__ == "__main__":
    main()