pyuser, Profile натравливал. Завтра скину вероятно результаты как на работе буду.
Добавлено через 14 часов 52 минуты
Для эксперимента возьму 100к элементов.
Первоначальный код, исходный из либы, который отвечает за приведение в xml форму, который и тормозит дико.
Код
| Python | 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
| class XML:
"XML Class - Used to convert nested lists into XML"
def __init__(self):
"Initialize ooolib XML instance"
pass
def _xmldata(self, data):
datatype = data.pop(0)
datavalue = data.pop(0)
outstring = '%s' % datavalue
return outstring
def _xmltag(self, data):
outstring = ''
# First two
datatype = data.pop(0)
dataname = data.pop(0)
outstring = '<%s' % dataname
# Element Section
element = 1
while(data):
# elements
newdata = data.pop(0)
if (newdata[0] == 'element' and element):
newstring = self._xmlelement(newdata)
outstring = '%s %s' % (outstring, newstring)
continue
if (newdata[0] != 'element' and element):
element = 0
outstring = '%s>' % outstring
if (newdata[0] == 'tag' or newdata[0] == 'tagline'):
outstring = '%s\n' % outstring
if (newdata[0] == 'tag'):
newstring = self._xmltag(newdata)
outstring = '%s%s' % (outstring, newstring)
continue
if (newdata[0] == 'tagline'):
newstring = self._xmltagline(newdata)
outstring = '%s%s' % (outstring, newstring)
continue
if (newdata[0] == 'data'):
newstring = self._xmldata(newdata)
outstring = '%s%s' % (outstring, newstring)
continue
if (element):
element = 0
outstring = '%s>\n' % outstring
outstring = '%s</%s>\n' % (outstring, dataname)
return outstring
def _xmltagline(self, data):
outstring = ''
# First two
datatype = data.pop(0)
dataname = data.pop(0)
outstring = '<%s' % dataname
# Element Section
while(data):
# elements
newdata = data.pop(0)
if (newdata[0] != 'element'): break
newstring = self._xmlelement(newdata)
outstring = '%s %s' % (outstring, newstring)
outstring = '%s/>\n' % outstring
# Non-Element Section should not exist
return outstring
def _xmlelement(self, data):
datatype = data.pop(0)
dataname = data.pop(0)
datavalue = data.pop(0)
outstring = '%s="%s"' % (dataname, datavalue)
return outstring
def convert(self, data):
"""Convert nested lists into XML
The convert method takes a nested lists and converts them
into XML to be used in Open Document Format documents.
There are three types of lists that are recognized at this
time. They are as follows:
'tag' - Tag opens a set of data that is eventually closed
with a similar tag.
List: ['tag', 'xml']
XML: <xml></xml>
'tagline' - Taglines are similar to tags, except they open
and close themselves.
List: ['tagline', 'xml']
XML: <xml/>
'element' - Elements are pieces of information stored in an
opening tag or tagline.
List: ['element', 'color', 'blue']
XML: color="blue"
'data' - Data is plain text directly inserted into the XML
document.
List: ['data', 'hello']
XML: hello
Bring them all together for something like this.
Lists:
['tag', 'xml', ['element', 'a', 'b'], ['tagline', 'xml2'],
['data', 'asdf']]
XML:
<xml a="b"><xml2/>asdf</xml>
"""
outlines = []
outlines.append('<?xml version="1.0" encoding="UTF-8"?>')
if (type(data) == type([]) and len(data) > 0):
if data[0] == 'tag':
outlines.append(self._xmltag(data))
return outlines |
|
Вызывается он
Этим методом
| Python | 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
| def _ods_content(self):
"Generate ods content.xml data"
# This will list all of the sheets in the document
self.sheetdata = ['tag', 'office:spreadsheet']
for sheet in self.sheets:
if self.debug:
sheet_name = sheet.get_name()
print " Creating Sheet '%s'" % sheet_name
sheet_list = sheet.get_lists()
self.sheetdata.append(sheet_list)
# Automatic Styles
self.automatic_styles = self.styles.get_automatic_styles()
self.data = ['tag', 'office:document-content',
['element', 'xmlns:office', 'urn:oasis:names:tc:opendocument:xmlns:office:1.0'],
['element', 'xmlns:style', 'urn:oasis:names:tc:opendocument:xmlns:style:1.0'],
['element', 'xmlns:text', 'urn:oasis:names:tc:opendocument:xmlns:text:1.0'],
['element', 'xmlns:table', 'urn:oasis:names:tc:opendocument:xmlns:table:1.0'],
['element', 'xmlns:draw', 'urn:oasis:names:tc:opendocument:xmlns:drawing:1.0'],
['element', 'xmlns:fo', 'urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0'],
['element', 'xmlns:xlink', 'http://www.w3.org/1999/xlink'],
['element', 'xmlns:dc', 'http://purl.org/dc/elements/1.1/'],
['element', 'xmlns:meta', 'urn:oasis:names:tc:opendocument:xmlns:meta:1.0'],
['element', 'xmlns:number', 'urn:oasis:names:tc:opendocument:xmlns:datastyle:1.0'],
['element', 'xmlns:svg', 'urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0'],
['element', 'xmlns:chart', 'urn:oasis:names:tc:opendocument:xmlns:chart:1.0'],
['element', 'xmlns:dr3d', 'urn:oasis:names:tc:opendocument:xmlns:dr3d:1.0'],
['element', 'xmlns:math', 'http://www.w3.org/1998/Math/MathML'],
['element', 'xmlns:form', 'urn:oasis:names:tc:opendocument:xmlns:form:1.0'],
['element', 'xmlns:script', 'urn:oasis:names:tc:opendocument:xmlns:script:1.0'],
['element', 'xmlns:ooo', 'http://openoffice.org/2004/office'],
['element', 'xmlns:ooow', 'http://openoffice.org/2004/writer'],
['element', 'xmlns:oooc', 'http://openoffice.org/2004/calc'],
['element', 'xmlns:dom', 'http://www.w3.org/2001/xml-events'],
['element', 'xmlns:xforms', 'http://www.w3.org/2002/xforms'],
['element', 'xmlns:xsd', 'http://www.w3.org/2001/XMLSchema'],
['element', 'xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance'],
['element', 'office:version', '1.0'],
['tagline', 'office:scripts'],
['tag', 'office:font-face-decls',
['tagline', 'style:font-face',
['element', 'style:name', 'DejaVu Sans'],
['element', 'svg:font-family', ''DejaVu Sans''],
['element', 'style:font-pitch', 'variable']],
['tagline', 'style:font-face',
['element', 'style:name', 'Nimbus Sans L'],
['element', 'svg:font-family', ''Nimbus Sans L''],
['element', 'style:font-family-generic', 'swiss'],
['element', 'style:font-pitch', 'variable']]],
# Automatic Styles
self.automatic_styles,
['tag', 'office:body',
self.sheetdata]] # Sheets are generated from the CalcSheet class
# Generate content.xml XML data
xml = XML()
self.lines = xml.convert(self.data)
self.filedata = '\n'.join(self.lines)
# Return generated data
return self.filedata |
|
В свою очередь у меня это вызывается через метод save, который для ods документа внутри себя использует переписанную save из ooolib (там не закрывался зип файл) - сохраняю на данный момент только content.xml.
Код
| Python | 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
| def save(self, filename):
"""Save .ods spreadsheet.
The save function saves the current cells and settings into a document.
"""
if self.debug: print "Writing %s" % filename
self.savefile = zipfile.ZipFile(filename, "w")
#if self.debug: print " meta.xml"
#meta_save_time = datetime.datetime.utcnow()
#self._zip_insert(self.savefile, "meta.xml", self.meta.get_meta())
#print 'meta save time is: ', datetime.datetime.utcnow() - meta_save_time
#if self.debug: print " mimetype"
#self._zip_insert(self.savefile, "mimetype", "application/vnd.oasis.opendocument.spreadsheet")
#if self.debug: print " Configurations2/accelerator/current.xml"
#self._zip_insert(self.savefile, "Configurations2/accelerator/current.xml", "")
#if self.debug: print " META-INF/manifest.xml"
#self._zip_insert(self.savefile, "META-INF/manifest.xml", self._ods_manifest())
if self.debug: print " content.xml"
content_save_time = datetime.datetime.utcnow()
self._zip_insert(self.savefile, "content.xml", self._ods_content())
print 'content save time is: ', datetime.datetime.utcnow() - content_save_time
#if self.debug: print " settings.xml"
#self._zip_insert(self.savefile, "settings.xml", self._ods_settings())
#if self.debug: print " styles.xml"
#self._zip_insert(self.savefile, "styles.xml", self._ods_styles())
# Add additional files if needed
#for fileset in self.manifest_files:
#(filename, filetype, newname) = fileset
# Read in the file
#data = self._file_load(filename)
#if self.debug: print " Inserting '%s' as '%s'" % (filename, newname)
#self._zip_insert_binary(self.savefile, newname, data)
self.savefile.close() |
|
Результат. Без профайлера.
| Code | 1
| content save time is: 0:01:10.617123 |
|
Профайлер вызывает просто дикое замедление, поэтому его стату выкладывать не буду.
Замена на cStringIO привела к такому коду. О правильности пока не заботился, поэтому работает он неверно, но мне важна пока только скорость.
Код
| Python | 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
| from cStringIO import StringIO
class XML:
"XML Class - Used to convert nested lists into XML"
def __init__(self):
self.IO = StringIO()
"Initialize ooolib XML instance"
pass
def _xmldata(self, data):
datatype = data.pop(0)
datavalue = data.pop(0)
self.IO.write('%s' + datavalue)
#outstring = '%s' % datavalue
#return outstring
def _xmltag(self, data):
outstring = ''
# First two
datatype = data.pop(0)
dataname = data.pop(0)
self.IO.write('<' + dataname)
#outstring = '<%s' % dataname
#print 'dname: ', dataname
# Element Section
element = 1
while(data):
# elements
newdata = data.pop(0)
if (newdata[0] == 'element' and element):
self._xmlelement(newdata)
#newstring = self._xmlelement(newdata)
#outstring = '%s %s' % (outstring, newstring)
#IO.write(' ' + newstring)
continue
if (newdata[0] != 'element' and element):
element = 0
self.IO.write('>')
#outstring = '%s>' % outstring
if (newdata[0] == 'tag' or newdata[0] == 'tagline'):
self.IO.write('\n')
#outstring = '%s\n' % outstring
if (newdata[0] == 'tag'):
self._xmltag(newdata)
#newstring = self._xmltag(newdata)
#outstring = '%s%s' % (outstring, newstring)
continue
if (newdata[0] == 'tagline'):
self._xmltagline(newdata)
#newstring = self._xmltagline(newdata)
#outstring = '%s%s' % (outstring, newstring)
continue
if (newdata[0] == 'data'):
self._xmldata(newdata)
#newstring = self._xmldata(newdata)
#outstring = '%s%s' % (outstring, newstring)
continue
if (element):
element = 0
self.IO.write('\n')
#outstring = '%s>\n' % outstring
self.IO.write('</' + dataname + '>\n')
#outstring = '%s</%s>\n' % (outstring, dataname)
#return outstring
def _xmltagline(self, data):
outstring = ''
# First two
datatype = data.pop(0)
dataname = data.pop(0)
self.IO.write('<' + dataname)
#outstring = '<%s' % dataname
# Element Section
while(data):
# elements
newdata = data.pop(0)
if (newdata[0] != 'element'): break
#newstring = self._xmlelement(newdata)
#outstring = '%s %s' % (outstring, newstring)
self.IO.write('>\n')
#outstring = '%s/>\n' % outstring
# Non-Element Section should not exist
#return outstring
def _xmlelement(self, data):
datatype = data.pop(0)
dataname = data.pop(0)
datavalue = data.pop(0)
self.IO.write(' ' + dataname + '=' + datavalue)
#outstring = '%s="%s"' % (dataname, datavalue)
#return outstring
def convert(self, data):
"""Convert nested lists into XML
The convert method takes a nested lists and converts them
into XML to be used in Open Document Format documents.
There are three types of lists that are recognized at this
time. They are as follows:
'tag' - Tag opens a set of data that is eventually closed
with a similar tag.
List: ['tag', 'xml']
XML: <xml></xml>
'tagline' - Taglines are similar to tags, except they open
and close themselves.
List: ['tagline', 'xml']
XML: <xml/>
'element' - Elements are pieces of information stored in an
opening tag or tagline.
List: ['element', 'color', 'blue']
XML: color="blue"
'data' - Data is plain text directly inserted into the XML
document.
List: ['data', 'hello']
XML: hello
Bring them all together for something like this.
Lists:
['tag', 'xml', ['element', 'a', 'b'], ['tagline', 'xml2'],
['data', 'asdf']]
XML:
<xml a="b"><xml2/>asdf</xml>
"""
outlines = []
outlines.append('<?xml version="1.0" encoding="UTF-8"?>')
if (type(data) == type([]) and len(data) > 0):
if data[0] == 'tag':
self._xmltag(data)
outlines.append(self.IO.getvalue())
self.IO.close()
return outlines |
|
Результат:
| Code | 1
| content save time is: 0:00:01.941046 |
|
Неплохая скорость. Но недостаточная.
Так же был вариант юзать lxml.
Код
| Python | 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
| def _ods_content(self):
"Generate ods content.xml data"
# This will list all of the sheets in the document
#self.sheetdata = []
#self.sheetdata = ['tag', 'office:spreadsheet']
# Automatic Styles
NSMAP = {'office' : 'urn:oasis:names:tc:opendocument:xmlns:office:1.0',
'style' : 'urn:oasis:names:tc:opendocument:xmlns:style:1.0',
'text' : 'urn:oasis:names:tc:opendocument:xmlns:text:1.0',
'table' : 'urn:oasis:names:tc:opendocument:xmlns:table:1.0',
'draw' : 'urn:oasis:names:tc:opendocument:xmlns:drawing:1.0',
'fo' : 'urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0',
'xlink' : 'http://www.w3.org/1999/xlink',
'dc' : 'http://purl.org/dc/elements/1.1/',
'meta' : 'urn:oasis:names:tc:opendocument:xmlns:meta:1.0',
'number' : 'urn:oasis:names:tc:opendocument:xmlns:datastyle:1.0',
'svg' : 'urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0',
'chart' : 'urn:oasis:names:tc:opendocument:xmlns:chart:1.0',
'dr3d' : 'urn:oasis:names:tc:opendocument:xmlns:dr3d:1.0',
'math' : 'http://www.w3.org/1998/Math/MathML',
'form' : 'urn:oasis:names:tc:opendocument:xmlns:form:1.0',
'script' : 'urn:oasis:names:tc:opendocument:xmlns:script:1.0',
'ooo' : 'http://openoffice.org/2004/office',
'ooow' : 'http://openoffice.org/2004/writer',
'oooc' : 'http://openoffice.org/2004/calc',
'dom' : 'http://www.w3.org/2001/xml-events',
'xforms' : 'http://www.w3.org/2002/xforms',
'xsd' : 'http://www.w3.org/2001/XMLSchema',
'xsi' : 'http://www.w3.org/2001/XMLSchema-instance'}
document_content_node = ET.Element('{urn:oasis:names:tc:opendocument:xmlns:office:1.0}document-content',
nsmap = NSMAP)
document_content_node.set('{urn:oasis:names:tc:opendocument:xmlns:office:1.0}version', '1.0')
ET.SubElement(document_content_node, '{urn:oasis:names:tc:opendocument:xmlns:office:1.0}scripts')
font_face_decls_node = ET.SubElement(document_content_node, '{urn:oasis:names:tc:opendocument:xmlns:office:1.0}font-face-decls')
font_face_node1 = ET.SubElement(font_face_decls_node, '{urn:oasis:names:tc:opendocument:xmlns:style:1.0}font-face')
font_face_node1.set('{urn:oasis:names:tc:opendocument:xmlns:style:1.0}name', 'DejaVu Sans')
font_face_node1.set('{urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0}font-family', ''DejaVu Sans'')
font_face_node1.set('{urn:oasis:names:tc:opendocument:xmlns:style:1.0}font-pitch', 'variable')
font_face_node2 = ET.SubElement(font_face_decls_node, '{urn:oasis:names:tc:opendocument:xmlns:style:1.0}font-face')
font_face_node2.set('{urn:oasis:names:tc:opendocument:xmlns:style:1.0}name', 'Nimbus Sans L')
font_face_node2.set('{urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0}font-family', ''Nimbus Sans L'')
font_face_node2.set('{urn:oasis:names:tc:opendocument:xmlns:style:1.0}font-family-generic', 'swiss')
font_face_node2.set('{urn:oasis:names:tc:opendocument:xmlns:style:1.0}font-pitch', 'variable')
self.styles.get_automatic_styles(document_content_node)
body_node = ET.SubElement(document_content_node, '{urn:oasis:names:tc:opendocument:xmlns:office:1.0}body')
spreadsheet_node = ET.SubElement(body_node, '{urn:oasis:names:tc:opendocument:xmlns:office:1.0}spreadsheet')
for sheet in self.sheets:
if self.debug:
sheet_name = sheet.get_name()
print " Creating Sheet '%s'" % sheet_name
sheet.get_lists(spreadsheet_node)
return ET.tostring(document_content_node, encoding = 'UTF-8', xml_declaration = True, pretty_print = False) |
|
Результат.
| Code | 1
| content save time is: 0:00:01.103054 |
|
Еще быстрее. Но жрет огромное колличество памяти и все равно недостаточно быстро.
В итоге даже при самом быстром варианте сохранение 500к элементов выливается в секунд 5-10.
Нужен же способ сохранять за приемлемое время около 3кк элементов.
0
|