Python

Default imported modules

All algorithms on this page assume the following modules are imported by default:

    import numpy as np
    import matplotlib.pyplot as plt
    import networkx as nx
    from collections import defaultdict

Flow network algorithms

Reading in network data and drawing the network

An example flow network

    G=nx.DiGraph()
    G.add_edge('source',1,weight=80)
    G.add_edge(1,2,weight=50)
    G.add_edge(1,3,weight=30)
    G.add_edge(3,2,weight=10)
    G.add_edge(2,4,weight=20)
    G.add_edge(2,5,weight=30)
    G.add_edge(4,5,weight=10)
    G.add_edge(5,3,weight=5)
    G.add_edge(2,'sink',weight=10)
    G.add_edge(4,'sink',weight=10)
    G.add_edge(3,'sink',weight=25)
    G.add_edge(5,'sink',weight=35)

Reading data and constructing the flow network

Example data (each record is a (user, node) pair from a clickstream, ordered by user):

 d=[['a',0],['a',1],['a',2],['b',1],['b',2],['c',1],['c',2],['c',3],['d',2],['d',3],
 ['e',0],['e',4],['f',0],['f',4],['g',0],['g',4],['g',5],['h',0],['h',5],['i',6]]


Define the function:

    def constructFlowNetwork(C):
        E = defaultdict(lambda: 0)
        E[('source', C[0][1])] += 1          # the first record opens a path from 'source'
        E[(C[-1][1], 'sink')] += 1           # the last record closes a path at 'sink'
        F = zip(C[:-1], C[1:])
        for i in F:
            if i[0][0] == i[1][0]:           # same user: an internal transition
                E[(i[0][1], i[1][1])] += 1
            else:                            # new user: close the old path, open a new one
                E[(i[0][1], 'sink')] += 1
                E[('source', i[1][1])] += 1
        G = nx.DiGraph()
        for i, j in E.items():
            x, y = i
            G.add_edge(x, y, weight=j)
        return G

Use the function:

    G = constructFlowNetwork(d)

Drawing the weighted network

    n = G.number_of_edges()   
    pos = nx.spring_layout(G) 
    colors = range(n)
    nx.draw(G, pos, node_size = 5, node_color = "#A0CBE2",
            edge_color = colors, width = 1,edge_cmap = plt.cm.Blues, with_labels = False)  
    #plt.savefig("E:/exampleNetwork.png")

Basic statistics of flow networks

Inspecting the outflow and inflow of any node

Outflow

    def outflow(G, node):
        n = 0
        for i in G.edges(node):
            n += G[i[0]][i[1]]['weight']
        return n

Inflow

    def inflow(G,node):
        return outflow(G.reverse(), node)
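
For example, on the example flow network above, node 1 receives 80 units from 'source' and sends 50 + 30 units on to nodes 2 and 3, so both functions should return 80 (a quick sanity check):

    print(outflow(G, 1))  # 80 = 50 + 30
    print(inflow(G, 1))   # 80, all from the 'source' -> 1 edge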

Balancing the flow of the network

Define the function

    def flowBalancing(G):
        H = G.copy()
        O = G.out_degree(weight='weight')            # networkx 1.x: a dict keyed by node
        I = G.reverse().out_degree(weight='weight')  # weighted in-degree
        for i in O:
            if i == 'sink' or i == 'source':
                continue
            de = I[i] - O[i]
            if de > 0:       # more inflow than outflow: route the surplus to 'sink'
                H.add_edge(i, 'sink', weight=de)
            elif de < 0:     # more outflow than inflow: feed the deficit from 'source'
                H.add_edge('source', i, weight=-de)
        return H

Use the function

    H = flowBalancing(G)

Obtaining dissipation data for all nodes

Define the function

    def networkDissipate(G):
        D = defaultdict(lambda: [0, 0, 0])  # toSink, totalflow, fromSource
        for x, y in G.edges():
            w = G[x][y]['weight']
            D[x][1] += w
            if y == 'sink':
                D[x][0] += w
            if x == 'source':
                D[y][2] += w
        return D

Use the function

    D = networkDissipate(G)

Computing the flow matrix of the network

Required modules

    import networkx as nx

Define the function

    def getFlowMatrix(G, nodelist=None):
        '''
        Read a graph and construct its flow matrix.
        '''
        if nodelist is None:
            return nx.to_numpy_matrix(G)
        return nx.to_numpy_matrix(G, nodelist)

Use the function

 
    fM = getFlowMatrix(G)


Computing the Markov matrix of the network

Required modules

    import numpy as np

Define the function

    def getMarkovMatrix(m):
        '''
        Read a flow matrix and construct the Markov (row-stochastic) matrix.
        '''
        n = len(m)
        mm = np.zeros((n, n), np.float)
        for i in range(n):
            for j in range(n):
                if m[i, j] > 0:
                    mm[i, j] = float(m[i, j]) / float(m[i, :].sum())
        return mm

Use the function

 
    MM = getMarkovMatrix(fM)

Computing the cumulative Markov transition (U) matrix over infinitely many steps

With 'sink' made absorbing and its row and column removed from the Markov matrix M, the cumulative transition matrix is U = I + M + M^2 + ... = (I - M)^-1, so the code simply inverts I - M.

Required modules

    from numpy import linalg as LA
    from numpy import delete

Define the function

    def getUmatrix(G):
        H = flowBalancing(G)
        F1 = nx.to_numpy_matrix(H)
        sourcep = H.nodes().index('source')
        sinkp = H.nodes().index('sink')
        F1[sinkp, sinkp] = 1               # make 'sink' absorbing
        M = F1 / F1.sum(axis=1)            # row-normalize into a Markov matrix
        M = delete(M, sinkp, 0)
        M = delete(M, sinkp, 1)
        I = np.identity(len(M))
        U = LA.inv(I - M)                  # fundamental matrix U = (I - M)^-1
        if sinkp > sourcep:                # drop the 'source' row/column from U,
            U = delete(U, sourcep, 0)      # whose index shifts if 'sink' came first
            U = delete(U, sourcep, 1)
        else:
            U = delete(U, sourcep - 1, 0)
            U = delete(U, sourcep - 1, 1)
        return U

Use the function

 
    U = getUmatrix(G)

Computing the average flow length of the network

Define the function

    def averageFlowLength(G):
        H = flowBalancing(G)
        F1 = nx.to_numpy_matrix(H)
        sinkp = H.nodes().index('sink')
        sourcep = H.nodes().index('source')
        F1[sinkp, sinkp] = 1
        M = F1 / F1.sum(axis=1)
        M = np.delete(M, sinkp, 0)
        M = np.delete(M, sinkp, 1)
        I = np.identity(len(M))
        U = LA.inv(I - M)
        # the row sum of U at 'source' is the expected number of steps to reach 'sink'
        if sinkp > sourcep:
            L = np.sum(U[sourcep, ])
        else:
            L = np.sum(U[sourcep - 1, ])
        return L

Use the function

    L = averageFlowLength(G)

Computing the average time matrix of the network

Required modules

    import networkx as nx

Define the function

    def getAverageTimeMatrix(G):
        H = flowBalancing(G)

        hn = H.nodes()           # networkx 1.x: nodes() returns a list
        hn.remove('source')
        hn.insert(0, 'source')   # put 'source' first in the node ordering

        m = getFlowMatrix(H, hn)
        M = getMarkovMatrix(m)

        U = getUmatrix(M)
        T = np.dot(U, U)
        T = np.dot(M, T)
        K = np.divide(T, U)

        return K

Use the function

 
    K = getAverageTimeMatrix(G)

Known issue: this raises AttributeError: 'numpy.ndarray' object has no attribute 'reverse', because getUmatrix expects a graph but is passed the Markov matrix M here, so flowBalancing ends up calling M.reverse().

Obtaining the Ai and Ci data for all nodes

Required modules

    from numpy import linalg as LA
    from numpy import delete

Define the function

    def AICI(G):
        H = flowBalancing(G)
        F1=nx.to_numpy_matrix(H)
        sourcep=H.nodes().index('source')
        sinkp=H.nodes().index('sink')
        F1oJ=F1[sourcep,]
        AI = F1.sum(0)
        if sinkp > sourcep:
            F1oJ=delete(F1oJ,sourcep,1)
            F1oJ=delete(F1oJ,sinkp-1,1)
            AI=delete(AI,sourcep,1)
            AI=delete(AI,sinkp-1,1)
            F1[sinkp,sinkp]=1
            M = F1 / F1.sum(axis=1)
            M = delete(M, sinkp, 0) 
            M = delete(M, sinkp, 1) 
            I = np.identity(len(M))
            U =  LA.inv( I - M)
            U = delete(U, sourcep, 0) 
            U = delete(U, sourcep, 1)
        else:
            F1oJ=delete(F1oJ,sinkp,1)
            F1oJ=delete(F1oJ,sourcep-1,1)
            AI=delete(AI,sinkp,1)
            AI=delete(AI,sourcep-1,1)
            F1[sinkp,sinkp]=1
            M = F1 / F1.sum(axis=1)
            M = delete(M, sinkp, 0) 
            M = delete(M, sinkp, 1) 
            I = np.identity(len(M))
            U =  LA.inv( I - M)
            U = delete(U, sourcep-1, 0) 
            U = delete(U, sourcep-1, 1)     
        def calculateCi(i):
            Ci =sum(U[i,:])*sum(np.dot(F1oJ,U[:,i]))/U[i,i]
            return Ci
        CI = map( lambda x:calculateCi(x),range(len(U)) )
        AI = AI.tolist()[0]
        return np.transpose(np.vstack((AI,CI)))

Use the function

    AICIData = AICI(G)

Several methods of network renormalization

Computing the fractal dimension of a network: box covering

Required modules

    from math import *
    from networkx import *
    import random
    from copy import deepcopy
    import os, sys
    import time
    import re
    import statsmodels.api as sm

Copyright note for the functions

    #------Download from: lev[dot]ccny[dot]cuny[dot]edu/~hmakse/modules[dot]py-----------------#
    '''
    Comments:
    If you have any questions or find a bug write Hernan Rozenfeld an email at
    hernanrozenfeld at domain gmail with the usual "dot com" at the end. 
    Code in Python (written by Hernan Rozenfeld):
    For this code to run you'll need to install Python and Networkx. 
    File "modules.py" contains algorithms MEMB, CBB, and random covering for
    network renormalization.
    '''

Define the functions

Fractal box covering, method 1: CBB

    def CBB(G, lb):  # This is the compact box burning algorithm.
        """
        It returns a dictionary with {box_id: subgraph_generated_by_the_nodes_in_this_box}
        The box_id is the center of the box.
        The subgraphs may be disconnected.
        """
        uncovered_nodes = set(G.nodes())
        covered_nodes = set([])
        boxes_subgraphs = {}
        adj = G.adj
        while uncovered_nodes:
            center = random.choice(list(uncovered_nodes))
            nodes_visited = {center: 0}
            search_queue = [center]
            d = 1
            while len(search_queue) > 0 and d <= lb - 1:
                next_depth = []
                extend = next_depth.extend
                for n in search_queue:
                    l = [i for i in adj[n].iterkeys() if i not in nodes_visited]
                    extend(l)
                    for j in l:
                        nodes_visited[j] = d
                search_queue = next_depth
                d += 1
            new_covered_nodes = set(nodes_visited.keys())
            new_covered_nodes = new_covered_nodes.difference(covered_nodes)
            nodes_checked_as_centers = set([center])
            while len(nodes_checked_as_centers) < len(new_covered_nodes):
                secondary_center = random.choice(list(new_covered_nodes.difference(nodes_checked_as_centers)))
                nodes_checked_as_centers.add(secondary_center)
                nodes_visited = {secondary_center: 0}
                search_queue = [secondary_center]
                d = 1
                while len(search_queue) > 0 and d <= lb - 1:
                    next_depth = []
                    extend = next_depth.extend
                    for n in search_queue:
                        l = [i for i in adj[n].iterkeys() if i not in nodes_visited]
                        extend(l)
                        for j in l:
                            nodes_visited[j] = d
                    search_queue = next_depth
                    d += 1
                nodes_covered_by_secondary = set(nodes_visited.keys())
                new_covered_nodes = new_covered_nodes.intersection(nodes_covered_by_secondary)
            boxes_subgraphs[center] = subgraph(G, list(new_covered_nodes))
            uncovered_nodes = uncovered_nodes.difference(new_covered_nodes)
            covered_nodes = covered_nodes.union(new_covered_nodes)
        return boxes_subgraphs
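
A minimal usage sketch (an assumption, not from the original page: CBB is run on an undirected copy of the example flow network, with box size lb = 3):

    H = G.to_undirected()
    boxes = CBB(H, 3)
    print(len(boxes))  # the number of boxes N_b needed at box size lb = 3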

Fractal box covering, method 2: RBC

    def random_box_covering(G, rb):
        """
        It returns a dictionary with {box_id: subgraph_generated_by_the_nodes_in_this_box}
        The box_id is the center of the box.
        """
        H = deepcopy(G)
        burned_nodes = []
        unburned_nodes = G.nodes()
        boxes_subgraphs = {}
        adj = H.adj
        while unburned_nodes:
            center_node = random.choice(unburned_nodes)
            nodes_visited = [center_node]
            search_queue = [center_node]
            d = 1
            while search_queue and d <= rb:
                next_depth = []
                extend = next_depth.extend
                for n in search_queue:
                    l = [i for i in adj[n].iterkeys() if i not in nodes_visited]
                    extend(l)
                    nodes_visited.extend(l)
                search_queue = next_depth
                d += 1
            new_burned_nodes = nodes_visited
            H.remove_nodes_from(new_burned_nodes)
            boxes_subgraphs[center_node] = subgraph(G, new_burned_nodes)
            unburned_nodes = list(set(unburned_nodes) - set(new_burned_nodes))
        return boxes_subgraphs

Fractal box covering, method 3: MEMB

    def MEMB(G, rb, cycle=0):
        """
        It returns a dictionary with {box_id: subgraph_generated_by_the_nodes_in_this_box}
        The box_id is the center of the box.
        cycle: Ignore this parameter. Use the default cycle=0.
        """
        adj = G.adj
        number_of_nodes = G.number_of_nodes()
        covered_nodes = set()
        center_nodes = set()
        non_center_nodes = G.nodes()
        center_node_found = 0
        boxes = {}  # this will be "box_id:[nodes in box]"
        central_distance_of_node = {}  # "node:central_distance"
        node_box_id = {}  # "node:box_id"
        nodes_sorted_by_central_distance = {}  # dict with {central_distance:[nodes]}
        excluded_mass_of_non_centers_rb = {}  # {excluded_mass: [nodes]} for rb
        excluded_mass_of_non_centers_rb2 = {}  # {excluded_mass: [nodes]} for rb+1
        rb2 = rb + 1
        for node in non_center_nodes:
            level = 0                   # the current level
            nextlevel = {node: 1}       # nodes to check at the next level
            paths_rb = None
            paths_rb2 = {node: [node]}  # paths dictionary (paths to key from source)
            while nextlevel:
                paths_rb = deepcopy(paths_rb2)
                thislevel = nextlevel
                nextlevel = {}
                for v in thislevel:
                    for w in G.neighbors(v):
                        if not paths_rb2.has_key(w):
                            paths_rb2[w] = paths_rb2[v] + [w]
                            nextlevel[w] = 1
                level = level + 1
                if rb2 <= level:
                    break
            excluded_mass_of_node = len(paths_rb2)
            try:
                excluded_mass_of_non_centers_rb2[excluded_mass_of_node].append(node)
            except KeyError:
                excluded_mass_of_non_centers_rb2[excluded_mass_of_node] = [node]
            excluded_mass_of_node = len(paths_rb)
            try:
                excluded_mass_of_non_centers_rb[excluded_mass_of_node].append(node)
            except KeyError:
                excluded_mass_of_non_centers_rb[excluded_mass_of_node] = [node]
        maximum_excluded_mass = 0
        nodes_with_maximum_excluded_mass = []
        new_covered_nodes = {}
        center_node_and_mass = []
        cycle_index = 0
        while len(covered_nodes) < number_of_nodes:
            cycle_index += 1
            if cycle_index == cycle:
                rb2 = rb + 1
                cycle_index = 0
            else:
                rb2 = rb
            while 1:
                if rb2 == rb + 1:
                    while 1:
                        maximum_key = max(excluded_mass_of_non_centers_rb2.keys())
                        node = random.choice(excluded_mass_of_non_centers_rb2[maximum_key])
                        if node in center_nodes:
                            excluded_mass_of_non_centers_rb2[maximum_key].remove(node)
                            if not excluded_mass_of_non_centers_rb2[maximum_key]:
                                del excluded_mass_of_non_centers_rb2[maximum_key]
                        else:
                            break
                    nodes_visited = {}
                    bfs = single_source_shortest_path(G, node, cutoff=rb2)
                    for i in bfs:
                        nodes_visited[i] = len(bfs[i]) - 1
                    excluded_mass_of_node = len(set(nodes_visited.keys()).difference(covered_nodes))
                    if excluded_mass_of_node == maximum_key:
                        center_node_and_mass = (node, maximum_key)
                        excluded_mass_of_non_centers_rb2[maximum_key].remove(node)
                        if not excluded_mass_of_non_centers_rb2[maximum_key]:
                            del excluded_mass_of_non_centers_rb2[maximum_key]
                        new_covered_nodes = nodes_visited
                        break
                    else:
                        excluded_mass_of_non_centers_rb2[maximum_key].remove(node)
                        if not excluded_mass_of_non_centers_rb2[maximum_key]:
                            del excluded_mass_of_non_centers_rb2[maximum_key]
                        try:
                            excluded_mass_of_non_centers_rb2[excluded_mass_of_node].append(node)
                        except KeyError:
                            excluded_mass_of_non_centers_rb2[excluded_mass_of_node] = [node]
                else:
                    while 1:
                        maximum_key = max(excluded_mass_of_non_centers_rb.keys())
                        node = random.choice(excluded_mass_of_non_centers_rb[maximum_key])
                        if node in center_nodes:
                            excluded_mass_of_non_centers_rb[maximum_key].remove(node)
                            if not excluded_mass_of_non_centers_rb[maximum_key]:
                                del excluded_mass_of_non_centers_rb[maximum_key]
                        else:
                            break
                    nodes_visited = {}
                    bfs = single_source_shortest_path(G, node, cutoff=rb)
                    for i in bfs:
                        nodes_visited[i] = len(bfs[i]) - 1
                    excluded_mass_of_node = len(set(nodes_visited.keys()).difference(covered_nodes))
                    if excluded_mass_of_node == maximum_key:
                        center_node_and_mass = (node, maximum_key)
                        excluded_mass_of_non_centers_rb[maximum_key].remove(node)
                        if not excluded_mass_of_non_centers_rb[maximum_key]:
                            del excluded_mass_of_non_centers_rb[maximum_key]
                        new_covered_nodes = nodes_visited
                        break
                    else:
                        excluded_mass_of_non_centers_rb[maximum_key].remove(node)
                        if not excluded_mass_of_non_centers_rb[maximum_key]:
                            del excluded_mass_of_non_centers_rb[maximum_key]
                        try:
                            excluded_mass_of_non_centers_rb[excluded_mass_of_node].append(node)
                        except KeyError:
                            excluded_mass_of_non_centers_rb[excluded_mass_of_node] = [node]
            center_node_found = center_node_and_mass[0]
            boxes[center_node_found] = [center_node_found]
            node_box_id[center_node_found] = center_node_found
            non_center_nodes.remove(center_node_found)
            center_nodes.add(center_node_found)
            covered_nodes = covered_nodes.union(set(new_covered_nodes.keys()))
            for i in new_covered_nodes:
                try:
                    if central_distance_of_node[i] > new_covered_nodes[i]:
                        nodes_sorted_by_central_distance[central_distance_of_node[i]].remove(i)
                        if not nodes_sorted_by_central_distance[central_distance_of_node[i]]:
                            del nodes_sorted_by_central_distance[central_distance_of_node[i]]
                        try:
                            nodes_sorted_by_central_distance[new_covered_nodes[i]].append(i)
                        except KeyError:
                            nodes_sorted_by_central_distance[new_covered_nodes[i]] = [i]
                        central_distance_of_node[i] = new_covered_nodes[i]
                except KeyError:
                    central_distance_of_node[i] = new_covered_nodes[i]
                    try:
                        nodes_sorted_by_central_distance[new_covered_nodes[i]].append(i)
                    except:
                        nodes_sorted_by_central_distance[new_covered_nodes[i]] = [i]
        max_distance = max(nodes_sorted_by_central_distance.keys())
        for i in range(1, max_distance + 1):
            for j in nodes_sorted_by_central_distance[i]:
                targets = list(set(adj[j].iterkeys()).intersection(set(nodes_sorted_by_central_distance[i - 1])))
                node_box_id[j] = node_box_id[random.choice(targets)]
                boxes[node_box_id[j]].append(j)
        boxes_subgraphs = {}
        for i in boxes:
            boxes_subgraphs[i] = subgraph(G, boxes[i])
        return boxes_subgraphs

Use the functions

    def alloRegressPlot(xdata, ydata, col, xlab, ylab):
        x = xdata; y = ydata
        xx = sm.add_constant(x, prepend=True)
        res = sm.OLS(y, xx).fit()
        constant, beta = res.params
        r2 = res.rsquared
        plt.plot(xdata, ydata, "o", color=col, markersize=5)
        plt.plot(xdata, constant + xdata*beta, color="red")
        plt.text(min(xdata)+(max(xdata)-min(xdata))/10, min(ydata)+(max(ydata)-min(ydata))/2, str(np.round(beta,2)))
        plt.xlabel(xlab)
        plt.ylabel(ylab)
        return beta

    def fractalDimension(G):
        data = box_counting_dim(G, lmax=20, num_covers=1)
        # the slope of log(Nb) against log(l) is -db, the box-counting dimension
        return alloRegressPlot(np.log(np.array(data.keys())), np.log(np.array(data.values())), "b", "l", "N")

    d = fractalDimension(G)

Fractal box covering, method 4: RB, by BEDWARDS

Copyright and notes for the functions

    #------Download from: networkx[dot]lanl[dot]gov/trac/ticket/359-----------------#
    """Return the fractal box counting dimension
    Returns the box counting dimension given by:
        "...the network is covered with Nb boxes, where all
        vertices in each of them are connected by a minimum
        distance smaller than lb. Nb and lb are found to be
        related by
            Nb proportional lb^-db
        where db is the fractal box dimension of the network."
    Note that Nb should be the minimal cover (ie inf(all Nb))
    however, [2] note that
        "...we find that minimization is not relevant and
        any covering gives rise to the same exponent."
    It is unclear in the paper whether this is only for networks
    that exhibit a scale free property, or for all networks
    so use with caution. We provide a value to calculate multiple
    random covers for each l. It will then take the minimum.
    Parameters
    ----------
    G          : graph
                 The graph used to compute the fractal cluster dimension.
    lmax       : Int
                 Max L to calculate Nb
    num_covers : int
                 Number of covers to calculate for each l
    Returns
    -------       
    Nb(l) : dict of ints
            The number of boxes need to cover the nodes given
            boxes of size l
    References
    ----------
    .. [1] L. da F. Costa, F.A. Rodrigues, G. Travieso, P.R. Villas Boas
           "Characterization of complex networks: A survey of
           measurements"
           Advances in Physics, Volume 56, Issue 1, Jan 2007
           pages 167-242
       [2] C. Song, S. Havlin and H.A. Makse, Nature 433 392 (2005).
    """

Define the functions

    def size_box_cover(G,l):
        Nbl = 0
        not_covered = set(G.nodes())
        while not_covered:
            n = random.choice(list(not_covered))
            sp = nx.single_source_shortest_path_length(G,n,cutoff=l)
            box_nodes = set(sp.keys())
            not_covered.difference_update(box_nodes)
            Nbl = Nbl + 1
        return Nbl
 
    def box_counting_dim(G,lmax=None,num_covers=1):
        if (lmax==None):
            lmax = nx.diameter(G)
        Nb = {}
        for l in range(1,lmax+1):
            Nblmin = G.order()
            for n in range(num_covers):
                Nbl = size_box_cover(G,l)
                if (Nbl <Nblmin):
                    Nblmin = Nbl
            Nb[l] = Nblmin
        return Nb

Computing the fractal dimension of a network: cluster growing

Fractal cluster growing, method 1, by BEDWARDS

Copyright and notes for the functions

    #------Download from: networkx[dot]lanl[dot]gov/trac/ticket/359-----------------#
    """Return the fractal cluster dimension
    Returns the cluster dimension for a network. Given by
        "...a seed vertex is chosen at random and a cluster
        is formed by vertices distance at most l from the
        seed. This process is repeated many times and the
        average mass of the resulting clusters is calculated
        as a function of l, resulting in the relation
        <Mc> proportional l^df
        where the average mass <Mc> is defined as the  number
        of vertices in the cluster and df is the fractal
        cluster dimension"[1]
    Parameters
    ----------
    G        : graph
               The graph used to compute the fractal cluster dimension.
    nsamples : int
               Number of random nsamples to use for each l
    lmax     : Int
               Largest l to calculate dimension for
    Returns
    -------       
    Mc(l) : dict of floats
            The average mass for each l given in a dictionary keyed
            by l.
    References
    ----------
    .. [1] L. da F. Costa, F.A. Rodrigues, G. Travieso, P.R. Villas Boas
           "Characterization of complex networks: A survey of
           measurements"
           Advances in Physics, Volume 56, Issue 1, Jan 2007
           pages 167-242
    """

Define the functions

    def average_cluster_mass(G,l,nsamples=10):
        try:
            nodes = random.sample(G.nodes(), nsamples)
        except ValueError:
            raise nx.NetworkXError("nsamples larger than number of nodes")
        mass = 0.0
        for n in nodes:
            sp = nx.single_source_shortest_path_length(G,n,cutoff=l)
            mass+= len(sp) - 1
        return float(mass)/len(nodes)
 
    def cluster_dim(G,nsamples=10,lmax=None):
        if (lmax==None):
            lmax = nx.diameter(G)
        Mc = {}
        for l in range(1,lmax+1):
            Mc[l] = average_cluster_mass(G,l,nsamples)
        return Mc
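
A usage sketch (hypothetical: the undirected copy and the fitting step are assumptions; the cluster dimension df is the slope of log <Mc> against log l):

    Mc = cluster_dim(G.to_undirected(), nsamples=10)
    # estimate df as the slope of an OLS fit of log(Mc[l]) on log(l),
    # e.g. with alloRegressPlot defined above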

Accounting for direction and weight during renormalization

Accounting for direction

    When constructing the network, use
    G = nx.DiGraph()
    By contrast, to ignore direction, use
    H = nx.Graph()  or  H = G.to_undirected()

Accounting for weight

    When computing the distance from a box center to the other nodes in the box, replace
    sp = nx.single_source_shortest_path_length(G,n,cutoff=l)
    with
    sp = nx.single_source_dijkstra_path_length(G,source=n,cutoff=l,weight='weight')

Computing basic network statistics during renormalization

Required modules

    import statsmodels.api as sm

Define the renormalization function

    def renormalizeMapping(G,l):
        replaceDic={}
        not_covered = set(G.nodes())
        while not_covered:
            n = random.choice(list(not_covered))
            sp = nx.single_source_shortest_path_length(G,n,cutoff=l)
            #sp = nx.single_source_dijkstra_path_length(G,source=n,cutoff=l,weight='weight')
            box_nodes = set(sp.keys())
            for i in list(box_nodes):
                replaceDic[i]=n
            not_covered.difference_update(box_nodes)
        return replaceDic

Define the statistics computed during renormalization

 
    def renormalizeStat(G,l):
        mapping = renormalizeMapping(G,l)
        G = nx.relabel_nodes(G,mapping)
        G.remove_edges_from(G.selfloop_edges())
        n = nx.number_of_nodes(G)
        m = nx.number_of_edges(G)
        w = G.size(weight='weight')
        return [n,m,w]

Customized renormalization statistics for flow networks that split into multiple components

    def renormalizeStat(G,l):
        if "sink" in G.nodes():
            G.remove_node("sink")
        g = nx.weakly_connected_component_subgraphs(G)[0]
        mapping = renormalizeMapping(g,l)
        g = nx.relabel_nodes(g,mapping)
        g.remove_edges_from(g.selfloop_edges())
        n = nx.number_of_nodes(g)
        m = nx.number_of_edges(g)
        h = flowBalancing(g)
        uv = outflow(h,"source")
        pv = h.size(weight='weight')-uv
        return [n,m,uv,pv]

Network growth models

The fractal / small-world growth model of Song et al.

    def fractal_model(generation, m, x, e):
        """
        Returns the fractal model introduced by
        Song, Havlin, Makse in Nature Physics 2, 275.
        generation = number of generations
        m = number of offspring per node
        x = number of connections between offsprings
        e = probability that hubs stay connected
        1-e = probability that x offsprings connect.
        If e=1 we are in MODE 1 (pure small-world).
        If e=0 we are in MODE 2 (pure fractal).
        """
        G = Graph()
        G.add_edge(0, 1)  # This is the seed for the network (generation 0)
        node_index = 2
        for n in range(1, generation + 1):
            all_links = G.edges()
            while all_links:
                link = all_links.pop()
                new_nodes_a = range(node_index, node_index + m)
                #random.shuffle(new_nodes_a)
                node_index += m
                new_nodes_b = range(node_index, node_index + m)
                #random.shuffle(new_nodes_b)
                node_index += m
                G.add_edges_from([(link[0], node) for node in new_nodes_a])
                G.add_edges_from([(link[1], node) for node in new_nodes_b])
                repulsive_links = zip(new_nodes_a, new_nodes_b)
                G.add_edges_from([repulsive_links.pop() for i in range(x - 1)])
                if random.random() > e:
                    G.remove_edge(*link)
                    G.add_edge(*repulsive_links.pop())
        return G
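
A usage sketch (the parameter values are illustrative only):

    G_frac = fractal_model(generation=3, m=2, x=2, e=0)  # e=0: pure fractal mode
    print(G_frac.number_of_nodes())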

Brownian motion

    import random

    def randomStep():
        w = random.vonmisesvariate(0, 0)  # uniform random direction
        r = random.normalvariate(0, 1)    # step length drawn from N(0,1)
        x = r*np.cos(w)
        y = r*np.sin(w)
        return np.array([x,y])
 
    x=0;y=0
    pos=[]
    for i in range(1000):
        pos.append([x,y])
        t = randomStep()
        x += t[0]
        y += t[1]
 
    pos = np.array(pos)
    plt.plot(pos[:,0],pos[:,1],"bo-")


Scraping network data

A toy model for web-page parsing

Download a sample html file from the comments section here, then test the following code:

Required modules

    from bs4 import BeautifulSoup


Use the functions

 
    with open ("E:/wulingfei/example.html", "r") as myfile:
        data=myfile.read().replace('\n', '')
 
    soup=BeautifulSoup(data)
    soup.select("title") # get title tag
    soup.select("body a") # all a tag inside body
    soup.select("html head title") # html->head->title
    soup.select("head > title") # head->title
    soup.select("p > a") # all a tag that inside p
    soup.select("body > a")# all a tag inside body
    soup.select(".sister") # select by class
    soup.select("#link1") # select by id
    soup.select('a[href="http://example.com/elsie"]')# find tags by attribute value
    soup.select('a[href^="http://example.com/"]') # find tags by attribute value, all contains 'http://example.com/'
    soup.select('p[lang|=en]') # Match language codes

Scraping a site's upstream-site data from Alexa

Required modules

    from bs4 import BeautifulSoup
    import urllib2
    import re
    import sys

Define the function

    def upstream(site):
        url='http://www.alexa.com/siteinfo/'+site
        a = urllib2.urlopen(url).read()
        b = a.split(r'keywords_upstream_site_table')[1]
        c = re.compile(r'<[^>]+>').sub('',b)
        d = c.split(r'%')
        # get rank
        traffic = 1/float(a.split(r'''alt='Global rank icon'><strong class="metrics-data align-vmiddle">''')\
                [1].split('<')[0])
        # get upstreams
        e = [i.split(r'&#46;&nbsp;&nbsp')[1] for i in d[:-1]]
        f=[(i.split(r' ')[0],traffic*float(i.split(r' ')[-1])/100) for i in e]
        return traffic, f

Use the function

    def flushprint(s):  # assumed helper: print progress and flush stdout immediately
        sys.stdout.write(s + '\n')
        sys.stdout.flush()

    edgelist = {}
    new = ['google.com']
    old = {}
    step = 0
    while step < 3:
        for seed in new:
            flushprint('step'+str(step)+'_'+seed+'_'+str(new.index(seed))+'_in_'+str(len(new)))
            try:
                if seed in old:
                    new.remove(seed)   # note: mutates `new` while iterating over it
                    continue
                traffic, f = upstream(seed)
                for up, weight in f:
                    edgelist[(up, seed)] = weight
                    new.append(up)
                old[seed] = traffic
            except:
                pass
        step += 1

Getting the latitude/longitude of a city (Chinese input) from the Baidu API and plotting it on a map

    import json
    import urllib2
    import time
    import random
    import matplotlib.pylab as pl
    from mpl_toolkits.basemap import Basemap

    def getLocation(cityName):
        key = "3aa105f287af8365fbe2abd01ad9a67b"
        baiduGeoAPI = "http://api.map.baidu.com/geocoder/v2/?address="\
        + cityName.decode('gbk').encode('utf-8') + "&output=json&ak=" + key
        jsonData = urllib2.urlopen(baiduGeoAPI).read()
        time.sleep(random.random()/1000)
        jd = json.loads(jsonData)
        if jd['result']:
            latitude = jd['result']['location']['lat']
            longitude = jd['result']['location']['lng']
            return [latitude, longitude]
        else:
            return [0, 0]

    def basemapScatterPlot(lats, longts, sizes, cols):
        fig = pl.figure(1, figsize=(8,8))
        ax = fig.add_subplot(111)
        #ax.patch.set_facecolor('Black')
        #ax.patch.set_alpha(0.5)
        m = Basemap(projection='cyl', llcrnrlat=3, urcrnrlat=54,
                    llcrnrlon=73, urcrnrlon=136, resolution='c')
        m.drawcoastlines(); m.drawcountries()
        cols = cols/max(cols)
        ax.scatter(longts, lats, s=sizes, c=1-cols, cmap=pl.get_cmap('Spectral'),
                   marker='o', edgecolors='None', label='')
        pl.draw()
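
A usage sketch (the city list is illustrative; getLocation expects GBK-encoded input and returns [lat, lng], and basemapScatterPlot expects arrays):

    cities = ['北京', '上海', '广州']
    coords = [getLocation(c) for c in cities]
    lats = np.array([c[0] for c in coords])
    lngs = np.array([c[1] for c in coords])
    basemapScatterPlot(lats, lngs, np.array([30]*3), np.array([1.0, 2.0, 3.0]))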


Getting the latitude/longitude of a city (Chinese input) from the Google API and plotting it on a map

    import pygmaps
    import webbrowser
    import matplotlib.pylab as pl

    def gmapScatterPlot(centerCity, ZoomScale, lats, lonts, sizes):
        center = getLocation(centerCity)
        mymap = pygmaps.maps(center[0], center[1], ZoomScale)
        for i in range(len(sizes)):
            mymap.addradpoint(lats[i], lonts[i], sizes[i]*1000, "#FF0000")
        mymap.draw('E:/wulingfei/mymap.html')
        webbrowser.open('E:/wulingfei/mymap.html')

Common data-analysis tools

Pearson correlation

Required modules

    from scipy.stats.stats import pearsonr

Use the function

    pearsonr(x, y)
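
pearsonr returns both the correlation coefficient and the two-sided p-value:

    r, p = pearsonr(x, y)  # r: Pearson correlation coefficient, p: two-tailed p-value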

Univariate ordinary least squares (OLS) regression

Required modules

    import statsmodels.api as sm

Fitting the regression parameters

    def OLSRegressFit(x,y):
        xx = sm.add_constant(x, prepend=True)
        res = sm.OLS(y,xx).fit()
        constant, beta = res.params
        r2 = res.rsquared
        return [constant, beta, r2]
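
A quick self-check on synthetic data (a sketch; the recovered constant and beta should be close to 1 and 2):

    x = np.random.rand(100)
    y = 1 + 2*x + np.random.normal(0, 0.1, 100)
    constant, beta, r2 = OLSRegressFit(x, y)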

Plotting the data points and the regression line

    def OLSRegressPlot(x,y,col,xlab,ylab):
        xx = sm.add_constant(x, prepend=True)
        res = sm.OLS(y,xx).fit()
        constant, beta = res.params
        r2 = res.rsquared
        plt.plot(x,y, "o",color=col,markersize=5)
        plt.plot(x,constant+x*beta,color="red")
        plt.text(min(x)+(max(x)-min(x))/10,min(y)+(max(y)-min(y))/2,str(np.round(beta,2)))
        plt.xlabel(xlab)
        plt.ylabel(ylab)

Univariate orthogonal regression

Required modules

    import scipy.odr

Fitting the regression parameters

    def othorgonalRegressFit(x,y):
        def f(p, x):
            return p[0]+p[1]*x
        myModel = scipy.odr.Model(f)
        myData = scipy.odr.Data(x, y)
        myOdr = scipy.odr.ODR(myData, myModel, beta0=[1,1])
        myOdr.set_job(fit_type=0)  # fit_type=0: full orthogonal distance regression (fit_type=2 would be OLS)
        out = myOdr.run()
        constant, beta = out.beta
        SSE = sum((constant + x*beta - y)**2)
        SST = SSE + sum((y-np.mean(y))**2)
        r2 = 1-SSE/SST
        return [constant, beta, r2]

Plotting the data points and the regression line

    def othorgonalRegressPlot(x, y, col, xlab, ylab):
        def f(p, x):
            return p[0] + p[1]*x
        myModel = scipy.odr.Model(f)
        myData = scipy.odr.Data(x, y)
        myOdr = scipy.odr.ODR(myData, myModel , beta0=[1,1])
        myOdr.set_job(fit_type=0) 
        out = myOdr.run()
        constant, beta = out.beta
        plt.plot(x, y, "o", color=col, markersize=5)
        plt.plot(x,constant + x * beta,color="red")
        plt.text(min(x)+(max(x)-min(x))/10,min(y)+(max(y)-min(y))/2,str(np.round(beta,2)))
        plt.xlabel(xlab)
        plt.ylabel(ylab)

Multivariate least squares regression

Required modules

    import statsmodels.api as sm

Fitting the regression parameters

    def OLSMRegressFit(x1, x2, y):
        x = np.column_stack((x1,x2))
        x = sm.add_constant(x, prepend=True)
        res = sm.OLS(y,x).fit()
        constant, beta1, beta2 = res.params
        return [constant, beta1, beta2]
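
A usage sketch on synthetic data (the coefficients are illustrative):

    x1 = np.random.rand(100)
    x2 = np.random.rand(100)
    y = 1 + 2*x1 - 3*x2 + np.random.normal(0, 0.1, 100)
    constant, beta1, beta2 = OLSMRegressFit(x1, x2, y)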

Log-bin data aggregation

    def BinData(x, y, number):
        # note: the bin edges here are linearly spaced
        data = sorted(zip(x,y))
        rs = np.linspace(min(x),max(x),number)
        rs = np.transpose(np.vstack((rs[:-1],rs[1:])))
        ndata = []
        for start,end in rs:
            within = []   # reset for every bin
            for i,j in data:
                if i>=start and i<end:
                    within.append(j)
            ndata.append([(start+end)/2.0, np.mean(np.array(within))])
        nx,ny = np.array(ndata).T
        return nx,ny

After testing, the function below works better. The log-binning method is recommended over linear binning; see Barabási 2016, Network Science, Cambridge [1].

    %matplotlib inline
    import networkx as nx
    import matplotlib.cm as cm
    import matplotlib.pyplot as plt
    from collections import defaultdict
    import numpy as np

    def log_binning(x, y, bin_count=35):
        max_x = np.log10(max(x))
        max_y = np.log10(max(y))
        max_base = max([max_x,max_y])
        xx = [i for i in x if i>0]
        min_x = np.log10(np.min(xx))
        bins = np.logspace(min_x,max_base,num=bin_count)
        # Based on: http://stackoverflow.com/questions/6163334/binning-data-in-python-with-scipy-numpy
        bin_means_y = (np.histogram(x,bins,weights=y)[0] / np.histogram(x,bins)[0])
        bin_means_x = (np.histogram(x,bins,weights=x)[0] / np.histogram(x,bins)[0])
        return bin_means_x,bin_means_y

    # test
    G = nx.barabasi_albert_graph(10000,3)
    degs = defaultdict(int)
    for i in G.degree().values(): degs[i]+=1
    items = sorted(degs.items())
    x, y = np.array(items).T
    y_sum = np.sum(y)
    y = [float(i)/y_sum for i in y]
    x_bin, y_bin = log_binning(x, y, 10)
    plt.plot(x, y, 'b-o')
    plt.scatter(x_bin, y_bin, color='r', marker='s', s=100)
    plt.xscale('log')
    plt.yscale('log')
    plt.legend(['Degree'])
    plt.xlabel('$K$', fontsize=20)
    plt.ylabel('$P(K)$', fontsize=20)
    plt.title('$Degree\,Distribution$', fontsize=20)
    plt.show()

(Figure: Logbinning.png — the log-binned degree distribution produced by the test above.)

Linear-bin data aggregation

    def linearBin(x,y):
        a=[]
        q=sorted(zip(map(int,x),y))
        tag = ''
        d=[]
        for i in q:
            x1,y1 = i
            if x1 == tag:
                d.append(y1)
            else:
                if tag != '':
                    a.append([tag,np.mean(d)])
                tag = x1
                d = [y1]   # start the new group with the current value
        a.append([tag,np.mean(d)])
        nx,ny = np.array(a).T
        return nx,ny

Fitting a DGBD distribution

Required modules

    import statsmodels.api as sm

Parameter fitting

The values are sorted in decreasing order and the rank-ordered data t(r) are fitted to the DGBD form t(r) = e^A (N+1-r)^b r^a, where r is the rank and N the sample size.

    def DGBDfFit(data):
        t = np.array(sorted(data, key=lambda x: -x))
        r = np.array(range(1, len(data)+1))
        y = np.log(t)
        x1 = np.log(max(r)+1-r)
        x2 = np.log(r)
        x = np.column_stack((x1,x2))
        x = sm.add_constant(x, prepend=True)
        res = sm.OLS(y,x).fit()
        [A,b,a] = res.params
        return [A,b,a]
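
A usage sketch (the lognormal sample is illustrative):

    data = np.random.lognormal(0, 1, 1000)
    A, b, a = DGBDfFit(data)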

Plotting the data points and the fitted curve

    def DGBDPlot(data):
        t = np.array(sorted(data, key=lambda x: -x))
        r = np.array(range(1, len(data)+1))
        y = np.log(t)
        x1 = np.log(max(r)+1-r)
        x2 = np.log(r)
        x = np.column_stack((x1,x2))
        x = sm.add_constant(x, prepend=True)
        res = sm.OLS(y,x).fit()
        [A,b,a] = res.params
        plt.plot(r,t,"o",color="b")
        plt.plot(r,np.exp(A)*(max(r)+1-r)**b*r**a,"r-")
        plt.yscale('log')
        plt.text(max(r)/2,max(t)/100,"b=" + str(round(b,2)) + ", a=" + str(round(a,2)))

Fitting a cumulative distribution as a power law with exponential cutoff

Required modules

    import statsmodels.api as sm
    import random

Parameter fitting

The normalized rank r (i.e. the empirical cumulative probability of a value t) is fitted to the form r(t) ∝ t^α e^{λt}.

    def powerLawExponentialCutOffFit(data):
        t = np.array(sorted(data,key=lambda x:-x))
        r = np.array(range(len(data))) +1
        r = r/float(np.max(r))
        y = np.log(r)
        x1 = np.log(t+1)
        x2 = t
        x = np.column_stack((x1,x2))
        x = sm.add_constant(x, prepend=True)
        res = sm.OLS(y,x).fit()
        L,alpha,lambde = res.params
        return [L,alpha,lambde]

Plotting the data points and the fitted curve

    def powerLawExponentialCutOffPlot(data):
        t = np.array(sorted(data,key=lambda x:-x))
        r = np.array(range(len(data))) +1
        r = r/float(np.max(r))
        y = np.log(r)
        x1 = np.log(t)
        x2 = t
        x = np.column_stack((x1,x2))
        x = sm.add_constant(x, prepend=True)
        res = sm.OLS(y,x).fit()
        L,alpha,lambde = res.params
        plt.plot(t,r,".",color="SteelBlue",alpha=0.75,markersize=3)
        plt.plot(t,np.exp(L) * t ** alpha * np.exp(lambde * t),"r-")
        plt.xscale('log'); plt.yscale('log')
        plt.xlabel(r'value ')
        plt.ylabel(r'cumulative probability')
        return [L,alpha,lambde]

Annotating the figure with formulas

    plt.text(10**1,10**-4, r"$f(x) \propto x^{\alpha}e^{\lambda x}$",size=25) 
    plt.text(10**1,10**-5, r"$\alpha = - 0.66$" + " "+r"$\lambda = -2.85e-06$",size=20)

Python scientific-computing tips

Reading a "\t"-separated file (loading the whole file into memory at once)

    data = []
    f = open('E:/xxx','rb')
    for line in f.readlines():
        line = line.strip().split("\t")
        data.append(line)
    f.close()

Reading a "\t"-separated file iteratively (without loading the whole file into memory)

    data = {}
    n = 0
    with open('xxx','rb') as f:
        for line in f:
            n += 1
            line = line.strip().split('\t')
            d = line[1]
            if d in data:
                data[d] += 1
            else:
                data[d] = 1
            if n % 10000 == 0:
                print str(int(n/10000))

Or:

    with open('D:/bitcoindata/txin.txt') as f:
        head=[f.next() for x in xrange(10)]

Quickly writing a "\t"-separated file

    f = open('xxx', "wb")
    for i in data:
        f.write('\t'.join(map(str,i)) + '\n')
    f.close()

Merging several lists into columns of an array

    t = np.vstack((a,b,c)).T
    or t = zip(a,b,c)

Selecting the rows of an np.array in which all elements are nonzero (for non-negative data, all greater than 0)

    d = d[d.all(1)]

Row-normalizing a matrix

    def normalizebyRow(a):
        row_sums = a.sum(axis=1)
        new_matrix = a / row_sums[:, np.newaxis]
        return new_matrix
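
A quick check (after row normalization every row sums to 1):

    a = np.random.rand(3, 4)
    print(normalizebyRow(a).sum(axis=1))  # -> [1. 1. 1.]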

Converting between GBK and UTF-8 encodings

    s = s.decode('gb2312').encode('utf-8')

Generating n random numbers between 0 and 1

        x = np.random.rand(n)

Left-padding the integer n with zeros to form a string of length k

        str(n).zfill(k)

Parsing string dates

Required modules

    import time
    from datetime import datetime


    timestamp = "1322485986"
    t = datetime.fromtimestamp( int(timestamp) )
    d = t.strftime('%Y-%m-%d')
    s="2012-01-01"
    time.mktime(datetime.strptime(s,'%Y-%m-%d').timetuple())

Getting the consecutive days between two dates

Required modules

    import datetime

Define the function

    def getDays(year1, month1, day1, year2, month2, day2):
        dt = datetime.datetime(year1, month1, day1)
        end = datetime.datetime(year2, month2, day2)
        step = datetime.timedelta(days=1)
        days = []
        while dt < end:
            days.append(dt.strftime('%Y%m%d'))
            dt += step
        return days

Use the function

    days = getDays(2013, 2, 27, 2013, 4, 27)

Plotting a histogram

Define the function

    def histogram (data,intervals,color,xlab):
        weights = np.ones_like(data)/len(data)
        plt.hist(data, intervals, facecolor = color, weights=weights, alpha = 0.75)
        plt.xlabel(xlab)

Use the function

    histogram (x,30,'green',r'$\theta$')

Plotting multiple subplots

    plt.figure(num=None, figsize=(12, 4), dpi=80, facecolor='w', edgecolor='k')
    plt.subplot(131)                                                             
    plt.plot(1,1,"bo")
    plt.subplot(132)
    plt.plot(1,1,"ro")
    plt.subplot(133)
    plt.plot(1,1,"mo")
    plt.tight_layout(pad=0.4, w_pad=0, h_pad=1.0)

Inserting an inset into a subplot

First define a function:

    def add_subplot_axes(ax,rect,axisbg='w'):
        fig = plt.gcf()
        box = ax.get_position()
        width = box.width
        height = box.height
        inax_position  = ax.transAxes.transform(rect[0:2])
        transFigure = fig.transFigure.inverted()
        infig_position = transFigure.transform(inax_position)    
        x = infig_position[0]
        y = infig_position[1]
        width *= rect[2]
        height *= rect[3]
        subax = fig.add_axes([x,y,width,height],axisbg=axisbg)
        x_labelsize = subax.get_xticklabels()[0].get_size()
        y_labelsize = subax.get_yticklabels()[0].get_size()
        x_labelsize *= rect[2]**0.5
        y_labelsize *= rect[3]**0.5
        subax.xaxis.set_tick_params(labelsize=x_labelsize)
        subax.yaxis.set_tick_params(labelsize=y_labelsize)
        return subax

Use the function

    fig = plt.figure(figsize=(10,10)) 
    axis1 = fig.add_subplot(2,2,1)
    axis1.plot(1,1,"bo")
    axis2 = fig.add_subplot(2,2,2)
    axis2.plot(1,1,"bo")
    axis3 = fig.add_subplot(2,2,3)
    axis3.plot(1,1,"bo")
    axis4 = fig.add_subplot(2,2,4)
    axis4.set_xlim(-np.pi,np.pi)
    axis4.set_ylim(-1,3)
    x = np.linspace(-np.pi,np.pi)
    axis4.plot(x,np.sin(x))
    subax = add_subplot_axes(axis4,[0.2,0.6,0.3,0.3])
    subax.plot(x,np.sin(x))
    plt.draw()

Bubble chart

    # c4 is assumed to be a list of records like [label, count, size, ..., y, x];
    # plotting uses pylab-style names (text, scatter, axis, ...), e.g. via `from pylab import *`
    x = []
    y = []
    color = []
    size = []
    area = []
    for i in c4:
        if i[1]>10 and i[0]!="Finance":
            x.append(i[-1])
            y.append(i[-2])
            size.append(i[2])
            area.append(i[1])
            color.append("c")
            text(i[-1],i[-2],i[0],size=9,family="Arial",horizontalalignment='center')
    sct = scatter(x, y, c=color, s=size, linewidths=2, edgecolor='w')
    sct.set_alpha(0.5)
    axis([0.52,0.7,1.03,1.16])
    xlabel(r'$\gamma$',size=18)
    ylabel(r'$\theta$',size=18)

3D plot

    import matplotlib.pyplot as plt
    from mpl_toolkits.mplot3d import Axes3D
 
    def randrange(n, vmin, vmax):
        return (vmax-vmin)*np.random.rand(n) + vmin
 
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    n = 100
    for c, m, zl, zh in [('r', 'o', -50, -25), ('b', '^', -30, -5)]:
        xs = randrange(n, 23, 32)
        ys = randrange(n, 0, 100)
        zs = randrange(n, zl, zh)
        ax.scatter(xs, ys, zs, c=c, marker=m)
 
    ax.set_xlabel('X Label')
    ax.set_ylabel('Y Label')
    ax.set_zlabel('Z Label')
    plt.draw()

Contour heat map

Required modules

    import matplotlib.pyplot as plt
    from matplotlib.colors import BoundaryNorm
    import numpy as np

Plotting

    # make these smaller to increase the resolution
    dx, dy = 0.05, 0.05
    # generate 2 2d grids for the x & y bounds
    y, x = np.mgrid[slice(-2, 2 + dy, dy),
                    slice(-2, 2 + dx, dx)]
    #z = np.sin(x) ** 10 + np.cos(10 + y * x) * np.cos(x)
    z = x**3-3*x+y**3-3*y
    # x and y are bounds, so z should be the value *inside* those bounds.
    # Therefore, remove the last value from the z array.
    z = z[:-1, :-1]
    levels=np.linspace(z.min(), z.max(),15)
    levels=np.array(map(lambda x:np.round(x,1),levels))
    cmap = plt.get_cmap('RdBu')
    norm = BoundaryNorm(levels, ncolors=cmap.N, clip=True)
    # contours are *point* based plots, so convert our bound into point
    # centers
    plt.contourf(x[:-1, :-1] + dx / 2., y[:-1, :-1] + dy / 2., z, levels=levels, cmap=cmap)
    plt.text(0.5,1.0,"Place text here",family="Arial",size=15)
    plt.text(-1.2,-1,"And also here",family="Arial",size=15)
    plt.colorbar()
    plt.title('contourf with levels')

Two ways to draw a heat map

    x = np.random.rand(1000)
    y = np.random.rand(1000)
    plt.subplot(121)
    heatmap, xedges, yedges = np.histogram2d(x, y, bins=50)
    extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
    plt.imshow(heatmap[:,::-1].T, extent=extent)
    plt.subplot(122) 
    H, xedges, yedges = np.histogram2d(x, y, bins=50)
    plt.pcolor(xedges, yedges, H)

Drawing a contour density map from 2D scatter data

    def densitydataPlot(xmin,xmax,ymin,ymax,dx,dy,dataArray):
        Y, X = np.mgrid[slice(ymin, ymax + dy, dy),
                        slice(xmin, xmax + dx, dx)]
        def Zdensityfunction(X,Y,dx,dy):
            nX=X.reshape([X.shape[0]*X.shape[1],1])
            nY=Y.reshape([Y.shape[0]*Y.shape[1],1])
            nXY=np.concatenate((nX,nY), axis=1)
            nZ=[]
            for i in nXY:
                x,y = i
                z = len([j for j in dataArray if j[0]>x-dx and j[0]<x+dx and j[1]>y-dy and j[1]<y+dy])
                nZ.append(z)
            return np.array(nZ).reshape(X.shape)
        Z=Zdensityfunction(X,Y,dx,dy)    
        plt.contourf(X + dx , Y + dy , Z,cmap = plt.cm.bone,origin='lower')
        plt.colorbar()

Drawing a smoothly interpolated density map from 2D scatter data

    import scipy.interpolate
    def densityPlot(x,y):
        rx = (x.max()-x.min())/20
        ry = (y.max()-y.min())/20
        z=[]
        for x0,y0 in zip(x,y):
            n=0
            for x1,y1 in zip(x,y):
                if np.abs(x1-x0)<=rx and np.abs(y1-y0)<=ry:
                    n+=1
            z.append(n)
        z=np.array(z)
        # Set up a regular grid of interpolation points
        xi, yi = np.linspace(x.min(), x.max(), 100), np.linspace(y.min(), y.max(), 100)
        xi, yi = np.meshgrid(xi, yi)
        # Interpolate
        rbf = scipy.interpolate.Rbf(x, y, z, function='linear')
        zi = rbf(xi, yi)
        plt.imshow(zi, vmin=z.min(), vmax=z.max(), origin='lower',cmap = plt.get_cmap('gray'),
                   extent=[x.min(), x.max(), y.min(), y.max()])
        plt.scatter(x, y, c= z,s=5, edgecolors='none',alpha=0.7)
        plt.colorbar()
        plt.show()

Heat map with a regression line

    def OLSRegressHeatPlot(x,y,xlab,ylab,resolution):
        heatmap, xedges, yedges = np.histogram2d(x, y, bins=resolution)
        extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
        xx = sm.add_constant(x, prepend=True)
        res = sm.OLS(y,xx).fit()
        constant, beta = res.params
        #plt.plot(x,y,".",color="m")
        plt.imshow(heatmap[:,::-1].T, extent=extent)
        plt.plot(x,constant+x*beta,color = "r")
        plt.axis(extent)
        plt.xlabel(xlab,size=15)
        plt.ylabel(ylab,size=15)
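
A usage sketch on synthetic correlated data (the values are illustrative):

    x = np.random.normal(0, 1, 5000)
    y = 2*x + np.random.normal(0, 1, 5000)
    OLSRegressHeatPlot(x, y, 'x', 'y', 50)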

Testing whether a string s consists entirely of digits

    s.isdigit()

A convenient way to display two decimal places

    a=13.946
    print("%.2f" % a)

Scraping Alexa upstream sites

    import urllib
    from bs4 import BeautifulSoup

    def getEdgelistofASite(aName):
        url = "http://www.alexa.com/siteinfo/" + aName
        html = urllib.urlopen(url).read()
        soup = BeautifulSoup(html)
        a=soup.select("#upstream-content")[0] # select by id
        c=a.select("a")
        sites=[]
        for i in c:
            sites.append(i.string)
        b=a.select('td[class=text-right]')
        traffic=[]
        for i in b:
            traffic.append(i.select("span")[0].string)
        data = np.array(zip(sites[:10],traffic[:10]))
        return data

Random walk

    import numpy as np
    import pylab as plt
 
    def RandomWalk(N=100, d=2):
        return np.cumsum(np.random.normal(0,1,(N,d)),axis=0)
 
    def PlotRandomWalkXT(N=100):
        X = RandomWalk(N,1)
        plt.plot(X,'k-')
        plt.xlabel('Time',size=14)
        plt.ylabel('Y',size=14)
        plt.title('1D random walk')
 
    def PlotRandomWalkXY(N=100):
        walk = RandomWalk(N)
        X, Y = walk.T
        plt.plot(X,Y,'k-')
        plt.axis('equal')
        plt.xlabel('X',size=14)
        plt.ylabel('Y',size=14)
        plt.title('2D random walk')
 
    def Endpoints(W=10000, N=10, d=2):
        return sum(np.random.normal(0,1,(N,W,d)))
 
    def PlotEndpoints(W=10000, N=10, d=2):
        X, Y = Endpoints(W, N, d).T
        plt.plot(X,Y,'k.')
        plt.axis('equal')
 
    def HistogramRandomWalk(W=10000, N=10, d=1, bins=50):
        X = Endpoints(W, N, d)[:,0]
        plt.hist(X, bins=bins, normed=1,color='gray',alpha=0.5)
        sigma = np.sqrt(N)
        x = np.arange(-3*sigma,3*sigma,sigma/bins)
        rho = (1/(np.sqrt(2*np.pi)*sigma))*np.exp(-x**2/(2*sigma**2))
        plt.plot(x, rho, "r-",linewidth=1)
        plt.xlabel('Distance',size=14)
        plt.ylabel('Probability',size=14)
        plt.title('Distance distribution N=10 D=2')
 
    def ScalingPlot(L):
        data=[[i,np.std(Endpoints(W=1000, N=int(i), d=2)[:,0])] for i in np.linspace(1,L,100)]
        n,s = np.array(data).T
        plt.plot(n,s,'k.')
        plt.plot(n,n**0.5,'r-')
        plt.ylabel(r'$\sigma$',size=14)
        plt.xlabel('N',size=14)
        plt.title('Scaling')
 
    fig = plt.figure(figsize=(8, 8),facecolor='white')
    ax = fig.add_subplot(2,2,1)
    PlotRandomWalkXT(10000)
    ax = fig.add_subplot(2,2,2)
    PlotRandomWalkXY(10000)
    ax = fig.add_subplot(2,2,3)
    HistogramRandomWalk(N=10)
    ax = fig.add_subplot(2,2,4)
    ScalingPlot(1000)
    plt.tight_layout()
    plt.show()

(Figure: Rw lingfei.png — the four random-walk panels produced by the code above.)

Quickly reversing a list t

    t[::-1]

An example of processing big data: splitting data

This can be generalized to frequency counting and other functions.

    #########################################

    def Print(Name, data):
        with open("E:/wulingfei/SRCDataDownload/test/" + Name,'wb') as p:
            for i in data:
                p.write(str(i) + "\n")

    def main():
        Name = ''
        data = []
        with open('E:/wulingfei/SRCDataDownload/test/20130227','rb') as f:
            for line in f:
                bar, hour, a, b = line.strip().split('\t')
                record = hour + "\t" + a + "\t" + b
                if bar == Name:
                    data.append(record)
                else:
                    if Name:
                        Print(Name, data)
                    Name = bar
                    data = [record]   # start the new chunk with the current record
            Print(Name, data)

    #########################################

    if __name__=="__main__":
        main()

Plotting xkcd-style figures

This is only available in matplotlib 1.3.1 and later; on Windows, download the installer package from the link here.

    import matplotlib.pyplot as plt
    plt.xkcd()

After that, figures drawn with the usual plt.plot commands will all have the xkcd style.

Copyright notice

The code on this page is released under the GNU GENERAL PUBLIC LICENSE. The authors and maintainers of the code (in order of joining) include:

计算士 wlf850927@163.com
老鱼 yudejun2000@gmail.com
黠之大者 wangchj04@gmail.com

References

  1. Barabási, A.-L. (2016). Network Science. Cambridge: Cambridge University Press.