����C %# , #&')*)-0-(0%()(��C (((((((((((((((((((((((((((((((((((((((((((((((((((����"�������@�@�hC��}!���Ѱ��<"� 9iׂIIIHk�+?�c?��*Y�����!�du)b�T�9вU�$8G��I.�澬��D���Sq� q�}.<��Z�l�V!X� *x�-�\����t3i�Ũ�sNv71�ƛ\��z|t�L���$�����*f��kʮ��7�H;���~F%�'3�@�H�q�` 9mOL����/x@ @��G
d�8F�ه��Ka�Kdr�Fh.�]y4 JЛ��]�K�B�E$��$ $ �PR�����G�]��u�i$�$���'! "#031���C/Td=S�Q?���62Ccj{ ����̏d�چ/c�V�`��Wz͈�{Y`�d�h�L �]OB���l���o���mr���n��s-ڗEZ��N�_��1%b���H�ϣ������V�7):�ӷ)�}�~�(�;�!�b1�5K��[E�vϻ>��q.%� ���O���(�c�#x�$�'+��`٥v��v(�����M�"�v��B��.�a ���T�~�ϕ�hy(6nݱl��1yNɓx�������AR�8�rqv1.cS�+��_���&@�� �u�M�5Ĉ�Xm���eL�X�q��y#�9]�c�}ɄL��d�eJ몓���I1T�d��CaM�$��T�,�X �bʭ�!�%F5��X1x#���!�q��\��F��2��&Rq���C�ol~�̱�.0ϦL�d�`.������ ���m{�Y~k{C��}bv�;U��c<�r�~ɜs�1�j��]W�l��*նCr��Q�N9�-������d��E؛��nF��eړ�8(q��5UgRȱGTA��*������̆��V�珰����ezN��h�U]�T�FG�^���<��ay�,!���5.� �u�bΚ�V�J%��m�Dxn'�����6�@BPa�`��Hts� �ɮ���Ŏ�Zɬ��%B�X��d5Z���hC}�䅸�p+ k=��ʒ(�aՏFG&�%@/�{+�Yu+�ȣGѩ"O%�|vȲxF>�N(��ou�h6 &Y5��8�7�E$-��']n,@TD\��+���Ry�U��U^�Q,f>��1�����q��f��U��� ����F���ڥ��>I�����fNUw�u��#OMMQ6� N�*��_�� k� ����rS��`���1�:��!�F'<+� � b?O��2 !Q12A��� "3a������#$��?�,�7�!`yǮ(�1�6w��a���� �F�#��?*"s���v>��Ⱥ����f�v��͑���s����������]Gn��S ���ȥpG ы�E�g�)Z���x�rY�q�]�@f�_܃�pչEڎّC ����Ŝ*/ �h�O�Sv�و\��5��U��y��|o�Hm2C�S�BW����)��5��{T��W���=o*RA��<����L0g4{��쁢�ep�rw�8��7��U���t<Ԍѻ7�fGf�k}���Ê�㛆Gռz�Q@��{C��'G��8�!�S$�j��x���|���צV<��,����u�k�uu�rM�f�_dϣi ߫�ԟn�!K����mxu�=�槻�'j�X�����������%!A "1QR#Br��?�R:��R�n�b[�II?#��6<:�$gN����lGNlrr��dעMMn`ɿy�,�%B�e�W��dVS��r���� %�tT��(�ɷ��S�]�O]#�_LEMHN�M���kv���~X���O6�U�V_�����b���J�t�774����D!1AQa"2q�#3BRb����0���� 4CSr����cst�����?��^q���7�dG�U�"p��moz��'��n_x���唹e������<6��O�t���R>k��s=�Cr���e�?�i��� ����/��ں$be���o`ޮ�GHy�;fNAl�8��.�\�S������"���a�úF�YvNk�-*`v�k�ʈ2f�EE��Wa�,� �fF^#�;��[9��^~������Y$:0#W3������Z*���I�Z�ڹ�k�n--9=��G��;7F)m{T�Ɇ��=�����Ȭ5�5�B�aڞ5M����#m�5Ʀ��m�8��+Hh���$�}�:&�e�Q�[;i]С�:�:��o����$<~��5RB�?�s3�5�r��O��ֿ�w�P/��̅���(�Z6�R>)��N��4�!ʊ�wz�-�r�w+�yk���q�1�bKhƸ�4N�Ӑ�X����Q��_��})�+e1�5��n��q?��[�^�9�<�z3Fsi�8�'�)9p)�{��RP�Z+�*��p(aY��V����6l�g�9��;���d�u���Nt@�3�sTwzaŇ�GT�b�H��(#��*zc�������9K�b1�����t����Ê��
�Z?g�iD���H�R���B���^M����v���O���L�D,'d�q�C�P�����$Δ��U�֟֊=�s��F�$��J�ދZ?�N��������A�N�WP��,�� �¦�&;�x��dup�����i���Ipd���;�Dž!��ֿѮAb%�u��}j��-p��>I�[�N�bi����G�'�;4w�m]H�]����#LӘNN��R��������s�.]��en��-�8e��Ps����Q��;���ț�E�ݫ���7��g�_L��W��EZ:/��I���a�g�n�ܤ��iٹ���ŷ�T���H~i�a�����֎�~KV������ A-2m]�F"�m�9-Zbǰ�״ @����~�4�N�[�Uxč�tl>������u#r�gѐ�3���;M9�<�J�����1�vfL8����1�P�HgP�Xv��������{����O�}�n��KQ؋����7<�l�fey<�}�>�bX���4<`Y7���si��V)�s�:�{�rO�h�z �@4VW�B���&�������ɡob܋�F��4>y�s�fXWS�N�O$�,.u:�ԫ��g�yao4��$h��D#��ٸf^kh�7�#1Z�֥&���*�v-��;bޭ����Q�����h�ow�y]�ه.+�7�M�ⴻ �JY��g�f�i3q��KC��3�¹�?5�Z.N��^Z w���KF͂���7��ރ۞��wj��T�J.�q��\Sv1U����R��욽&�N����pЖ`�`у��m`v�n#z��4��>e��V�`'���h�����'�j�AҔ�-�4:H���n]9�h<��n����U�6m��2c�E�1/�Y�%���I��~ʏ�|VBƟ@����;�������%�M9M���}��1�D��d����%g���O��]��у&�r��f�7�uܲ���(!1AQaq�������0� ���?!��*��@)�Je�G��j��{�['��v+���������)���(�/����д%젍Z��kk�Lu�Rm���j.c���@Z� V�J��d��j���h6���2AO�� a;oBu���H�=���nK�W8�B�ɰ�u?��бأm,�sr����|����8˨i��qI2tZ�ۄJP��XE��������zޔj~]UMu����zv!����N�&�1�Y��zJ�ՠ��\p��o'ሸ�C؊Y��TD"HM5�Ъ��i߯a���F����A)�����ڮ����z�E���@�hg�֝8�1jk��\�M�3�8ܢ�� ������s�7����N}�ޭ������GN�Bc���L pk�;�J�δ3�e�iU�gAYW]\�>�GyگQ=��f�KA;T�a`eM+Q �� �Ln���̌]GM�����<Ħ�j���H��N�M�x�}aX{̣S� ��ԅ��n�MA�S�r�(����(�L��zo9���.�;
�ӳf������`Ӕ٢3�� IW��\9~_���saa�\ԊW�ܭX:���ӆ�38�ty*����N�qP����BI�Y��jE��>DP�!�R%-��4��'�皺;��~J�!�7m���X��h�P!曭���$�\�AYj�.lC��4��+�jD�dgC0-*���|��`ZD�+л�C"��)��s��8Kq�pq���Ms��4� ��7\U`�.��[Ey8��AH!/��,���(:M -�T䓥�~O�4-���Ԓn��}HDN7���K���$�_Ԕ䚞`�R�hB�_aX?4V��ŗ�@ه�u�a�;�{PcT+�������7YBo�?��r-ͩ{�ĎA�� ����˼n��M286��G���1���V�˜Jв"l��V5���5�C]h���̊�A���%� �'p���Ԃ���Ր��9=�d�=�e�{�'<3�_ �:^�~��4�(�n�-C�s��5m![�jmIqU�~�Tw8��`���p�H8�u�Д l m�aP�0�������9y����CM��F1G糞�.�U~�������FC�{�!e(Y�:���P����7~;�L�N^{�1r�\���ԬG(���0d�ÏO�qK�Z�⑼�T�{ 2��s��Kd�Տ?mMQ��=���6�7�i�����H+����9��d��=��;�QؤH8n�Lb�D��yS%�(�{b���Cu���p�t#C���$A"�H{���jqᶯ�:�n=E����hH�`�!�m��MA������?�v6���+MԿ⟚qK�i�D�*Q5��CZ���2�|]�:Xd+�t�:o@��M��� :�32��b����[\5=�ֵ7])�|t��Ϻ����w�B�ń�e���!`�:��I,��9:����j@/a 8����+<�u�(T^ۺ~��2oE�B�%b)��z��ݳځ�)��i�j��&��Fi`qr��w���7�@��P�� �3Z&<�m�S�C����7t�T����ƴ�q~J�e�r6�Z]�rL���ه�E17'�x���+[�ܜTc6�/�����W�`�qpMJ���N5^����x�}{l�Fm������1�oZ\�����/d�/6� �uӸ�0elXuX;M��$M�}mB��������Z%e���3f�js����O�J~2�z�86�*PB��v�Ν��e-��.�/��L�O����2����9���4}|��T5M���hÐ7�F*��l+y0����:|��=k[�d�;|�ԉe�=w�<��õ�<��'!1AQaq����� ������?��5����)�(���+>v����6&{���Ǹ@����M�����v��iA 6T'�w��h�s �E}�x��G&'g�� J~1q�f�f���&��q˘���-���vYm
�/i1 �I��6��u,)�#�,����l}*&`�$�ͬe�%�w3�x�Ѥ�Xc�D��执g�峕�5B/�|$��=���%8 a��2.l� c�@G� �\�/x[өq�]�v5?�����N|�!���\��,>��{�"r�/��?��&!1QAa�� ��ᑱ����?ĊD�肭�� nv@�yޝ (�����I ����U - ���b�m�E>,��1v!�d�&�� ���&�檔�5D�&0P��Ԕ�͒@Z��:E"� Q��`>PH:~�O�����P�3W��@hM��k�U��\�O��R�������5ʄ�,��f�|��r���}јxo)�"+h�QK���/��0�`�5�{M~�� ���'!1AQaq���0 �������?�?�k��#^�~�G��#V,������#Z�1'ܤ����������~p�O%O�O�\�q�`�~��}��E�Ű5 �輸�du����x\�$���s[�{T2t`B��gq�4Z]b� 㛪�3,(@����bAp�r)9:@|b�!r�g:N�^�Ʌ��� �x_�\��pm7I��0?>^k��������w���|.K�[sF@�]Gn*L �yO� le�P�.p��֍�j�S�=�ʨ�ןQF�"��5zʼn���k�*8�u" ����Fg��� �cSy�V������Ƈ��N��ؐ(�����48hV�A�ӎ^��^ ���jyB� ��p"�����y]�ļlU�(�7�U`3�pCGF'&yg������o��z������X��ν:�P"@�G@x[��o&MJ�$F.����hi w;}�/^͇q���n�mN�/�TQ���އ��O1\,}��bQ #¯^S!)��X���#GPȏ�t�� c^\��' }iIZ���a�)��������z��4͊�Ξy��48,��f���#�����KP!Jx�|w�ʆ�������������#��Z�������< �~K��r�p&qH/;�R���沽�+�E�R���~0v���V#ʀ�T��S(-ڝ��B�y�b�C�D������b��������8��~�= �Y�ͧ]��@n����M�k2�%�;�%,�r6�LR腻?^��;KŇ=�ք ���=`�ɥ��/����z�&�I{���#J��M���C��}�H9^UJ�,P ��pS����G�d69Ϭu���%"��ˢP��K�"k)��=��9� ����㇌,��Oli��Xzh� " � ������R��^�s����N�k��Q>�63(���� ��PQ�Py�����3����$f+W՛=4�ǁ`*��^��Eb�K�t�6��^��!�籷��ȭ��K{/;�L���p�x�����;a���Oلz�[�.NP4�]Gc�T�v����~sg'LED��]j��'�G�]�6rY����UPw�*O�İՋi�'8�۴�#g�Xx+=�eU6�R��c�"�u2��~�?n�y�;�u��3�'��6�f������b��߬M�$*��k&?6���*^1n����ێz)<��Gz� �����7����Y� ��ۃ)$A��2�L6� ե�H�<�r��#ʽ2��O��R���z�A��XW��@���������<�G� Ϥ�^�˓i�M�W���6 ��0��m){c�;ݧ�>R�a����}1�ٯ%�EY2�Q��Ep���$ ��E��qS��t#+x� *�h�UI��XM?�'//��a'�G�����q@���<��z��؟����cd��z�ˬT_u�Ѯ����&�z�k ��n ]�a%�py»�`Qd�xc������n�� ��*��oTd�;'j�<�!j���'�(~�ʹW�M� P�mȘ��@֨V+��R�`�$��`�+@��_[�kG����P���Zh9�R����&5b�v���Z���#p�&�Ա+��8�etZ7G���;��@"�e0���v7����?��z�?_���_�q1�T�"�p�ˎ/U 6_�B�>��0( ��}G#������Ȣ�p�� �9��;/& `�B&$�y��t(�*z�x���Ӕ������S�?Kȏ3���{p� b � ۍ-�z܈֦��6?<���ǬP�N�G �更� �6�/h�����0Z���������i�ua��e�*M'A� �x��v�q.>�F� oN{��Q���{gD��L��u��=|���O xN���d���q�8(��E�Uu��,��O� t�DJ ����;��G����e���C��VYZ�� ���T4{����(�Ӳ'c�t�f��w�c�jr�e�m �#7,�6��B�E4Q�P�.P�(&��^{9H-�m�o ��q�g1���=��>p�)/"p0!4�mS6ú�FN���h��D �)��XdT �FؤZ⸚�k���H�c8v� <���u�P�Հ���:��_�EN��|�ӛ��u?-�/�o�Lhk�ܸ�S�;�Rī�����T"�N����M��px7<�� j�$��`�Y)Pjh 5` K�Qf�4�C�bX"�D���;HD�Z�9R b�F)�UA����v�#��HD�!{������>I� �`�ԁ i�4�)t*�ç�Le�_���>ru�GEQg��ǔct��ō0��l6v���d�� ��GG8���v^�|�#JyZPSO�� Y�CuAߐ�"�x���OfHF@�K�V�!少Eҕ]h� ��[���)��.q����*0I<8��^�6�}p��^tho���ig�i����DK���p,��2�3�I��5����쓄OY�6s7Qs�Ow^�w�J/�A➰������0������g(Մ��y��Kԇ����QS��?H���w�X�=��ҞX�~���Q=�'���p?7�@g�~�G�}�r��g�T?���
One Hat Cyber Team
One Hat Cyber Team
Your IP :
18.118.114.72
Server IP :
162.0.235.113
Server :
Linux premium146.web-hosting.com 4.18.0-513.18.1.lve.el8.x86_64 #1 SMP Thu Feb 22 12:55:50 UTC 2024 x86_64
Server Software :
LiteSpeed
PHP Version :
5.6.40
Buat File
|
Buat Folder
Dir :
~
/
usr
/
lib64
/
python3.6
/
html
/
Edit File Name:
parser.py
"""A parser for HTML and XHTML.""" # This file is based on sgmllib.py, but the API is slightly different. # XXX There should be a way to distinguish between PCDATA (parsed # character data -- the normal case), RCDATA (replaceable character # data -- only char and entity references and end tags are special) # and CDATA (character data -- only end tags are special). import re import warnings import _markupbase from html import unescape __all__ = ['HTMLParser'] # Regular expressions used for parsing interesting_normal = re.compile('[&<]') incomplete = re.compile('&[a-zA-Z#]') entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') starttagopen = re.compile('<[a-zA-Z]') piclose = re.compile('>') commentclose = re.compile(r'--\s*>') # Note: # 1) if you change tagfind/attrfind remember to update locatestarttagend too; # 2) if you change tagfind/attrfind and/or locatestarttagend the parser will # explode, so don't do it. # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*') attrfind_tolerant = re.compile( r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*' r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*') locatestarttagend_tolerant = re.compile(r""" <[a-zA-Z][^\t\n\r\f />\x00]* # tag name (?:[\s/]* # optional whitespace before attribute name (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name (?:\s*=+\s* # value indicator (?:'[^']*' # LITA-enclosed value |"[^"]*" # LIT-enclosed value |(?!['"])[^>\s]* # bare value ) (?:\s*,)* # possibly followed by a comma )?(?:\s|/(?!>))* )* )? \s* # trailing whitespace """, re.VERBOSE) endendtag = re.compile('>') # the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between # </ and the tag name, so maybe this should be fixed endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>') class HTMLParser(_markupbase.ParserBase): """Find tags and other markup and call handler functions. Usage: p = HTMLParser() p.feed(data) ... p.close() Start tags are handled by calling self.handle_starttag() or self.handle_startendtag(); end tags by self.handle_endtag(). The data between tags is passed from the parser to the derived class by calling self.handle_data() with the data as argument (the data may be split up in arbitrary chunks). If convert_charrefs is True the character references are converted automatically to the corresponding Unicode character (and self.handle_data() is no longer split in chunks), otherwise they are passed by calling self.handle_entityref() or self.handle_charref() with the string containing respectively the named or numeric reference as the argument. """ CDATA_CONTENT_ELEMENTS = ("script", "style") def __init__(self, *, convert_charrefs=True): """Initialize and reset this instance. If convert_charrefs is True (the default), all character references are automatically converted to the corresponding Unicode characters. """ self.convert_charrefs = convert_charrefs self.reset() def reset(self): """Reset this instance. Loses all unprocessed data.""" self.rawdata = '' self.lasttag = '???' self.interesting = interesting_normal self.cdata_elem = None _markupbase.ParserBase.reset(self) def feed(self, data): r"""Feed data to the parser. Call this as often as you want, with as little or as much text as you want (may include '\n'). """ self.rawdata = self.rawdata + data self.goahead(0) def close(self): """Handle any buffered data.""" self.goahead(1) __starttag_text = None def get_starttag_text(self): """Return full source of start tag: '<...>'.""" return self.__starttag_text def set_cdata_mode(self, elem): self.cdata_elem = elem.lower() self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I) def clear_cdata_mode(self): self.interesting = interesting_normal self.cdata_elem = None # Internal -- handle data as far as reasonable. May leave state # and data to be processed by a subsequent call. If 'end' is # true, force handling all data as if followed by EOF marker. def goahead(self, end): rawdata = self.rawdata i = 0 n = len(rawdata) while i < n: if self.convert_charrefs and not self.cdata_elem: j = rawdata.find('<', i) if j < 0: # if we can't find the next <, either we are at the end # or there's more text incoming. If the latter is True, # we can't pass the text to handle_data in case we have # a charref cut in half at end. Try to determine if # this is the case before proceeding by looking for an # & near the end and see if it's followed by a space or ;. amppos = rawdata.rfind('&', max(i, n-34)) if (amppos >= 0 and not re.compile(r'[\s;]').search(rawdata, amppos)): break # wait till we get all the text j = n else: match = self.interesting.search(rawdata, i) # < or & if match: j = match.start() else: if self.cdata_elem: break j = n if i < j: if self.convert_charrefs and not self.cdata_elem: self.handle_data(unescape(rawdata[i:j])) else: self.handle_data(rawdata[i:j]) i = self.updatepos(i, j) if i == n: break startswith = rawdata.startswith if startswith('<', i): if starttagopen.match(rawdata, i): # < + letter k = self.parse_starttag(i) elif startswith("</", i): k = self.parse_endtag(i) elif startswith("<!--", i): k = self.parse_comment(i) elif startswith("<?", i): k = self.parse_pi(i) elif startswith("<!", i): k = self.parse_html_declaration(i) elif (i + 1) < n: self.handle_data("<") k = i + 1 else: break if k < 0: if not end: break k = rawdata.find('>', i + 1) if k < 0: k = rawdata.find('<', i + 1) if k < 0: k = i + 1 else: k += 1 if self.convert_charrefs and not self.cdata_elem: self.handle_data(unescape(rawdata[i:k])) else: self.handle_data(rawdata[i:k]) i = self.updatepos(i, k) elif startswith("&#", i): match = charref.match(rawdata, i) if match: name = match.group()[2:-1] self.handle_charref(name) k = match.end() if not startswith(';', k-1): k = k - 1 i = self.updatepos(i, k) continue else: if ";" in rawdata[i:]: # bail by consuming &# self.handle_data(rawdata[i:i+2]) i = self.updatepos(i, i+2) break elif startswith('&', i): match = entityref.match(rawdata, i) if match: name = match.group(1) self.handle_entityref(name) k = match.end() if not startswith(';', k-1): k = k - 1 i = self.updatepos(i, k) continue match = incomplete.match(rawdata, i) if match: # match.group() will contain at least 2 chars if end and match.group() == rawdata[i:]: k = match.end() if k <= i: k = n i = self.updatepos(i, i + 1) # incomplete break elif (i + 1) < n: # not the end of the buffer, and can't be confused # with some other construct self.handle_data("&") i = self.updatepos(i, i + 1) else: break else: assert 0, "interesting.search() lied" # end while if end and i < n and not self.cdata_elem: if self.convert_charrefs and not self.cdata_elem: self.handle_data(unescape(rawdata[i:n])) else: self.handle_data(rawdata[i:n]) i = self.updatepos(i, n) self.rawdata = rawdata[i:] # Internal -- parse html declarations, return length or -1 if not terminated # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state # See also parse_declaration in _markupbase def parse_html_declaration(self, i): rawdata = self.rawdata assert rawdata[i:i+2] == '<!', ('unexpected call to ' 'parse_html_declaration()') if rawdata[i:i+4] == '<!--': # this case is actually already handled in goahead() return self.parse_comment(i) elif rawdata[i:i+3] == '<![': return self.parse_marked_section(i) elif rawdata[i:i+9].lower() == '<!doctype': # find the closing > gtpos = rawdata.find('>', i+9) if gtpos == -1: return -1 self.handle_decl(rawdata[i+2:gtpos]) return gtpos+1 else: return self.parse_bogus_comment(i) # Internal -- parse bogus comment, return length or -1 if not terminated # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state def parse_bogus_comment(self, i, report=1): rawdata = self.rawdata assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to ' 'parse_comment()') pos = rawdata.find('>', i+2) if pos == -1: return -1 if report: self.handle_comment(rawdata[i+2:pos]) return pos + 1 # Internal -- parse processing instr, return end or -1 if not terminated def parse_pi(self, i): rawdata = self.rawdata assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()' match = piclose.search(rawdata, i+2) # > if not match: return -1 j = match.start() self.handle_pi(rawdata[i+2: j]) j = match.end() return j # Internal -- handle starttag, return end or -1 if not terminated def parse_starttag(self, i): self.__starttag_text = None endpos = self.check_for_whole_start_tag(i) if endpos < 0: return endpos rawdata = self.rawdata self.__starttag_text = rawdata[i:endpos] # Now parse the data between i+1 and j into a tag and attrs attrs = [] match = tagfind_tolerant.match(rawdata, i+1) assert match, 'unexpected call to parse_starttag()' k = match.end() self.lasttag = tag = match.group(1).lower() while k < endpos: m = attrfind_tolerant.match(rawdata, k) if not m: break attrname, rest, attrvalue = m.group(1, 2, 3) if not rest: attrvalue = None elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ attrvalue[:1] == '"' == attrvalue[-1:]: attrvalue = attrvalue[1:-1] if attrvalue: attrvalue = unescape(attrvalue) attrs.append((attrname.lower(), attrvalue)) k = m.end() end = rawdata[k:endpos].strip() if end not in (">", "/>"): lineno, offset = self.getpos() if "\n" in self.__starttag_text: lineno = lineno + self.__starttag_text.count("\n") offset = len(self.__starttag_text) \ - self.__starttag_text.rfind("\n") else: offset = offset + len(self.__starttag_text) self.handle_data(rawdata[i:endpos]) return endpos if end.endswith('/>'): # XHTML-style empty tag: <span attr="value" /> self.handle_startendtag(tag, attrs) else: self.handle_starttag(tag, attrs) if tag in self.CDATA_CONTENT_ELEMENTS: self.set_cdata_mode(tag) return endpos # Internal -- check to see if we have a complete starttag; return end # or -1 if incomplete. def check_for_whole_start_tag(self, i): rawdata = self.rawdata m = locatestarttagend_tolerant.match(rawdata, i) if m: j = m.end() next = rawdata[j:j+1] if next == ">": return j + 1 if next == "/": if rawdata.startswith("/>", j): return j + 2 if rawdata.startswith("/", j): # buffer boundary return -1 # else bogus input if j > i: return j else: return i + 1 if next == "": # end of input return -1 if next in ("abcdefghijklmnopqrstuvwxyz=/" "ABCDEFGHIJKLMNOPQRSTUVWXYZ"): # end of input in or before attribute value, or we have the # '/' from a '/>' ending return -1 if j > i: return j else: return i + 1 raise AssertionError("we should not get here!") # Internal -- parse endtag, return end or -1 if incomplete def parse_endtag(self, i): rawdata = self.rawdata assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag" match = endendtag.search(rawdata, i+1) # > if not match: return -1 gtpos = match.end() match = endtagfind.match(rawdata, i) # </ + tag + > if not match: if self.cdata_elem is not None: self.handle_data(rawdata[i:gtpos]) return gtpos # find the name: w3.org/TR/html5/tokenization.html#tag-name-state namematch = tagfind_tolerant.match(rawdata, i+2) if not namematch: # w3.org/TR/html5/tokenization.html#end-tag-open-state if rawdata[i:i+3] == '</>': return i+3 else: return self.parse_bogus_comment(i) tagname = namematch.group(1).lower() # consume and ignore other stuff between the name and the > # Note: this is not 100% correct, since we might have things like # </tag attr=">">, but looking for > after tha name should cover # most of the cases and is much simpler gtpos = rawdata.find('>', namematch.end()) self.handle_endtag(tagname) return gtpos+1 elem = match.group(1).lower() # script or style if self.cdata_elem is not None: if elem != self.cdata_elem: self.handle_data(rawdata[i:gtpos]) return gtpos self.handle_endtag(elem.lower()) self.clear_cdata_mode() return gtpos # Overridable -- finish processing of start+end tag: <tag.../> def handle_startendtag(self, tag, attrs): self.handle_starttag(tag, attrs) self.handle_endtag(tag) # Overridable -- handle start tag def handle_starttag(self, tag, attrs): pass # Overridable -- handle end tag def handle_endtag(self, tag): pass # Overridable -- handle character reference def handle_charref(self, name): pass # Overridable -- handle entity reference def handle_entityref(self, name): pass # Overridable -- handle data def handle_data(self, data): pass # Overridable -- handle comment def handle_comment(self, data): pass # Overridable -- handle declaration def handle_decl(self, decl): pass # Overridable -- handle processing instruction def handle_pi(self, data): pass def unknown_decl(self, data): pass # Internal -- helper to remove special character quoting def unescape(self, s): warnings.warn('The unescape method is deprecated and will be removed ' 'in 3.5, use html.unescape() instead.', DeprecationWarning, stacklevel=2) return unescape(s)
Save