1  import difflib 
  2  from lxml import etree 
  3  from lxml.html import fragment_fromstring 
  4  import cgi 
  5  import re 
  6   
  7  __all__ = ['html_annotate', 'htmldiff'] 
  8   
  9   
 10   
 11   
 12   
 13   
 15      return '<span title="%s">%s</span>' % ( 
 16          cgi.escape(unicode(version), 1), text) 
  17   
 19      """ 
 20      doclist should be ordered from oldest to newest, like:: 
 21   
 22          >>> version1 = 'Hello World' 
 23          >>> version2 = 'Goodbye World' 
 24          >>> html_annotate([(version1, 'version 1'), 
 25          ...                (version2, 'version 2')]) 
 26          u'<span title="version 2">Goodbye</span> <span title="version 1">World</span>' 
 27   
 28      The documents must be *fragments* (str/UTF8 or unicode), not 
 29      complete documents 
 30   
 31      The markup argument is a function to markup the spans of words. 
 32      This function is called like markup('Hello', 'version 2'), and 
 33      returns HTML.  The first argument is text and never includes any 
 34      markup.  The default uses a span with a title: 
 35   
 36          >>> default_markup('Some Text', 'by Joe') 
 37          u'<span title="by Joe">Some Text</span>' 
 38      """ 
 39       
 40       
 41       
 42       
 43       
 44      tokenlist = [tokenize_annotated(doc, version) 
 45                   for doc, version in doclist] 
 46      cur_tokens = tokenlist[0] 
 47      for tokens in tokenlist[1:]: 
 48          html_annotate_merge_annotations(cur_tokens, tokens) 
 49          cur_tokens = tokens 
 50   
 51       
 52       
 53      cur_tokens = compress_tokens(cur_tokens) 
 54       
 55      result = markup_serialize_tokens(cur_tokens, markup) 
 56      return ''.join(result).strip() 
  57   
 59      """Tokenize a document and add an annotation attribute to each token 
 60      """ 
 61      tokens = tokenize(doc, include_hrefs=False) 
 62      for tok in tokens:  
 63          tok.annotation = annotation 
 64      return tokens 
 65   
 67      """Merge the annotations from tokens_old into tokens_new, when the 
 68      tokens in the new document already existed in the old document. 
 69      """ 
 70      s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new) 
 71      commands = s.get_opcodes() 
 72   
 73      for command, i1, i2, j1, j2 in commands: 
 74          if command == 'equal':  
 75              eq_old = tokens_old[i1:i2] 
 76              eq_new = tokens_new[j1:j2] 
 77              copy_annotations(eq_old, eq_new) 
 78   
 80      """ 
 81      Copy annotations from the tokens listed in src to the tokens in dest 
 82      """ 
 83      assert len(src) == len(dest) 
 84      for src_tok, dest_tok in zip(src, dest):  
 85          dest_tok.annotation = src_tok.annotation 
 86   
 88      """ 
 89      Combine adjacent tokens when there is no HTML between the tokens,  
 90      and they share an annotation 
 91      """ 
 92      result = [tokens[0]]  
 93      for tok in tokens[1:]:  
 94          if (not result[-1].post_tags and  
 95              not tok.pre_tags and  
 96              result[-1].annotation == tok.annotation):  
 97              compress_merge_back(result, tok) 
 98          else:  
 99              result.append(tok) 
100      return result 
 101   
103      """ Merge tok into the last element of tokens (modifying the list of 
104      tokens in-place).  """ 
105      last = tokens[-1] 
106      if type(last) is not token or type(tok) is not token:  
107          tokens.append(tok) 
108      else: 
109          text = unicode(last) 
110          if last.trailing_whitespace: 
111              text += ' ' 
112          text += tok 
113          merged = token(text, 
114                         pre_tags=last.pre_tags, 
115                         post_tags=tok.post_tags, 
116                         trailing_whitespace=tok.trailing_whitespace) 
117          merged.annotation = last.annotation 
118          tokens[-1] = merged 
119       
121      """ 
122      Serialize the list of tokens into a list of text chunks, calling 
123      markup_func around text to add annotations. 
124      """ 
125      for token in tokens: 
126          for pre in token.pre_tags: 
127              yield pre 
128          html = token.html() 
129          html = markup_func(html, token.annotation) 
130          if token.trailing_whitespace: 
131              html += ' ' 
132          yield html 
133          for post in token.post_tags: 
134              yield post 
 135   
136   
137   
138   
139   
140   
142       
143       
144      """ Do a diff of the old and new document.  The documents are HTML 
145      *fragments* (str/UTF8 or unicode), they are not complete documents 
146      (i.e., no <html> tag). 
147   
148      Returns HTML with <ins> and <del> tags added around the 
149      appropriate text.   
150   
151      Markup is generally ignored, with the markup from new_html 
152      preserved, and possibly some markup from old_html (though it is 
153      considered acceptable to lose some of the old markup).  Only the 
154      words in the HTML are diffed.  The exception is <img> tags, which 
155      are treated like words, and the href attribute of <a> tags, which 
156      are noted inside the tag itself when there are changes. 
157      """  
158      old_html_tokens = tokenize(old_html) 
159      new_html_tokens = tokenize(new_html) 
160      result = htmldiff_tokens(old_html_tokens, new_html_tokens) 
161      result = ''.join(result).strip() 
162      return fixup_ins_del_tags(result) 
 163   
165      """ Does a diff on the tokens themselves, returning a list of text 
166      chunks (not tokens). 
167      """ 
168       
169       
170       
171       
172       
173       
174       
175       
176       
177       
178       
179       
180       
181      s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens) 
182      commands = s.get_opcodes() 
183      result = [] 
184      for command, i1, i2, j1, j2 in commands: 
185          if command == 'equal': 
186              result.extend(expand_tokens(html2_tokens[j1:j2], equal=True)) 
187              continue 
188          if command == 'insert' or command == 'replace': 
189              ins_tokens = expand_tokens(html2_tokens[j1:j2]) 
190              merge_insert(ins_tokens, result) 
191          if command == 'delete' or command == 'replace': 
192              del_tokens = expand_tokens(html1_tokens[i1:i2]) 
193              merge_delete(del_tokens, result) 
194       
195       
196       
197       
198      result = cleanup_delete(result) 
199   
200      return result 
 201   
203      """Given a list of tokens, return a generator of the chunks of 
204      text for the data in the tokens. 
205      """ 
206      for token in tokens: 
207          for pre in token.pre_tags: 
208              yield pre 
209          if not equal or not token.hide_when_equal: 
210              if token.trailing_whitespace: 
211                  yield token.html() + ' ' 
212              else: 
213                  yield token.html() 
214          for post in token.post_tags: 
215              yield post 
 216   
218      """ doc is the already-handled document (as a list of text chunks); 
219      here we add <ins>ins_chunks</ins> to the end of that.  """ 
220       
221       
222       
223      unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks) 
224      doc.extend(unbalanced_start) 
225      if doc and not doc[-1].endswith(' '): 
226           
227           
228          doc[-1] += ' ' 
229      doc.append('<ins>') 
230      if balanced and balanced[-1].endswith(' '): 
231           
232          balanced[-1] = balanced[-1][:-1] 
233      doc.extend(balanced) 
234      doc.append('</ins> ') 
235      doc.extend(unbalanced_end) 
 236   
237   
238   
239   
244   
246      """ Raised when the document no longer contains any pending deletes 
247      (DEL_START/DEL_END) """ 
 248   
250      """ Adds the text chunks in del_chunks to the document doc (another 
251      list of text chunks) with marker to show it is a delete. 
252      cleanup_delete later resolves these markers into <del> tags.""" 
253      doc.append(DEL_START) 
254      doc.extend(del_chunks) 
255      doc.append(DEL_END) 
 256   
258      """ Cleans up any DEL_START/DEL_END markers in the document, replacing 
259      them with <del></del>.  To do this while keeping the document 
260      valid, it may need to drop some tags (either start or end tags). 
261   
262      It may also move the del into adjacent tags to try to move it to a 
263      similar location where it was originally located (e.g., moving a 
264      delete into preceding <div> tag, if the del looks like (DEL_START, 
265      'Text</div>', DEL_END)""" 
266      while 1: 
267           
268           
269           
270          try: 
271              pre_delete, delete, post_delete = split_delete(chunks) 
272          except NoDeletes: 
273               
274              break 
275           
276           
277          unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete) 
278           
279           
280          locate_unbalanced_start(unbalanced_start, pre_delete, post_delete) 
281          locate_unbalanced_end(unbalanced_end, pre_delete, post_delete) 
282          doc = pre_delete 
283          if doc and not doc[-1].endswith(' '): 
284               
285              doc[-1] += ' ' 
286          doc.append('<del>') 
287          if balanced and balanced[-1].endswith(' '): 
288               
289              balanced[-1] = balanced[-1][:-1] 
290          doc.extend(balanced) 
291          doc.append('</del> ') 
292          doc.extend(post_delete) 
293          chunks = doc 
294      return chunks 
 295   
297      """Return (unbalanced_start, balanced, unbalanced_end), where each is 
298      a list of text and tag chunks. 
299   
300      unbalanced_start is a list of all the tags that are opened, but 
301      not closed in this span.  Similarly, unbalanced_end is a list of 
302      tags that are closed but were not opened.  Extracting these might 
303      mean some reordering of the chunks.""" 
304      start = [] 
305      end = [] 
306      tag_stack = [] 
307      balanced = [] 
308      for chunk in chunks: 
309          if not chunk.startswith('<'): 
310              balanced.append(chunk) 
311              continue 
312          endtag = chunk[1] == '/' 
313          name = chunk.split()[0].strip('<>/') 
314          if name in empty_tags: 
315              balanced.append(chunk) 
316              continue 
317          if endtag: 
318              if tag_stack and tag_stack[-1][0] == name: 
319                  balanced.append(chunk) 
320                  name, pos, tag = tag_stack.pop() 
321                  balanced[pos] = tag 
322              elif tag_stack: 
323                  start.extend([tag for name, pos, tag in tag_stack]) 
324                  tag_stack = [] 
325                  end.append(chunk) 
326              else: 
327                  end.append(chunk) 
328          else: 
329              tag_stack.append((name, len(balanced), chunk)) 
330              balanced.append(None) 
331      start.extend( 
332          [chunk for name, pos, chunk in tag_stack]) 
333      balanced = [chunk for chunk in balanced if chunk is not None] 
334      return start, balanced, end 
 335   
337      """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END, 
338      stuff_after_DEL_END).  Returns the first case found (there may be 
339      more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if 
340      there's no DEL_START found. """ 
341      try: 
342          pos = chunks.index(DEL_START) 
343      except ValueError: 
344          raise NoDeletes 
345      pos2 = chunks.index(DEL_END) 
346      return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:] 
 347   
349      """ pre_delete and post_delete implicitly point to a place in the 
350      document (where the two were split).  This moves that point (by 
351      popping items from one and pushing them onto the other).  It moves 
352      the point to try to find a place where unbalanced_start applies. 
353   
354      As an example:: 
355   
356          >>> unbalanced_start = ['<div>'] 
357          >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>'] 
358          >>> pre, post = doc[:3], doc[3:] 
359          >>> pre, post 
360          (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>']) 
361          >>> locate_unbalanced_start(unbalanced_start, pre, post) 
362          >>> pre, post 
363          (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>']) 
364   
365      As you can see, we moved the point so that the dangling <div> that 
366      we found will be effectively replaced by the div in the original 
367      document.  If this doesn't work out, we just throw away 
368      unbalanced_start without doing anything. 
369      """ 
370      while 1: 
371          if not unbalanced_start: 
372               
373              break 
374          finding = unbalanced_start[0] 
375          finding_name = finding.split()[0].strip('<>') 
376          if not post_delete: 
377              break 
378          next = post_delete[0] 
379          if next is DEL_START or not next.startswith('<'): 
380               
381              break 
382          if next[1] == '/': 
383               
384              break 
385          name = next.split()[0].strip('<>') 
386          if name == 'ins': 
387               
388              break 
389          assert name != 'del', ( 
390              "Unexpected delete tag: %r" % next) 
391          if name == finding_name: 
392              unbalanced_start.pop(0) 
393              pre_delete.append(post_delete.pop(0)) 
394          else: 
395               
396              break 
 397   
399      """ like locate_unbalanced_start, except handling end tags and 
400      possibly moving the point earlier in the document.  """ 
401      while 1: 
402          if not unbalanced_end: 
403               
404              break 
405          finding = unbalanced_end[-1] 
406          finding_name = finding.split()[0].strip('<>/') 
407          if not pre_delete: 
408              break 
409          next = pre_delete[-1] 
410          if next is DEL_END or not next.startswith('</'): 
411               
412              break 
413          name = next.split()[0].strip('<>/') 
414          if name == 'ins' or name == 'del': 
415               
416              break 
417          if name == finding_name: 
418              unbalanced_end.pop() 
419              post_delete.insert(0, pre_delete.pop()) 
420          else: 
421               
422              break 
 423   
425      """ Represents a diffable token, generally a word that is displayed to 
426      the user.  Opening tags are attached to this token when they are 
427      adjacent (pre_tags) and closing tags that follow the word 
428      (post_tags).  Some exceptions occur when there are empty tags 
429      adjacent to a word, so there may be close tags in pre_tags, or 
430      open tags in post_tags. 
431   
432      We also keep track of whether the word was originally followed by 
433      whitespace, even though we do not want to treat the word as 
434      equivalent to a similar word that does not have a trailing 
435      space.""" 
436   
437       
438       
439      hide_when_equal = False 
440   
441 -    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=False): 
 442          obj = unicode.__new__(cls, text) 
443   
444          if pre_tags is not None: 
445              obj.pre_tags = pre_tags 
446          else: 
447              obj.pre_tags = [] 
448   
449          if post_tags is not None: 
450              obj.post_tags = post_tags 
451          else: 
452              obj.post_tags = [] 
453   
454          obj.trailing_whitespace = trailing_whitespace 
455   
456          return obj 
 457   
459          return 'token(%s, %r, %r)' % (unicode.__repr__(self), self.pre_tags, self.post_tags) 
 460   
 463   
465   
466      """ Represents a token that is actually a tag.  Currently this is just 
467      the <img> tag, which takes up visible space just like a word but 
468      is only represented in a document by a tag.  """ 
469   
470 -    def __new__(cls, tag, data, html_repr, pre_tags=None,  
471                  post_tags=None, trailing_whitespace=False): 
 472          obj = token.__new__(cls, "%s: %s" % (type, data),  
473                              pre_tags=pre_tags,  
474                              post_tags=post_tags,  
475                              trailing_whitespace=trailing_whitespace) 
476          obj.tag = tag 
477          obj.data = data 
478          obj.html_repr = html_repr 
479          return obj 
 480   
482          return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%s)' % ( 
483              self.tag,  
484              self.data,  
485              self.html_repr,  
486              self.pre_tags,  
487              self.post_tags,  
488              self.trailing_whitespace) 
 490          return self.html_repr 
  491   
493   
494      """ Represents the href in an anchor tag.  Unlike other words, we only 
495      show the href when it changes.  """ 
496   
497      hide_when_equal = True 
498   
500          return 'Link: %s' % self 
  501   
503      """ 
504      Parse the given HTML and returns token objects (words with attached tags). 
505   
506      This parses only the content of a page; anything in the head is 
507      ignored, and the <head> and <body> elements are themselves 
508      optional.  The content is then parsed by lxml, which ensures the 
509      validity of the resulting parsed document (though lxml may make 
510      incorrect guesses when the markup is particular bad). 
511   
512      <ins> and <del> tags are also eliminated from the document, as 
513      that gets confusing. 
514   
515      If include_hrefs is true, then the href attribute of <a> tags is 
516      included as a special kind of diffable token.""" 
517      body_el = parse_html(html, cleanup=True) 
518       
519      chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs) 
520       
521      return fixup_chunks(chunks) 
 522   
524      """ 
525      Parses an HTML fragment, returning an lxml element.  Note that the HTML will be 
526      wrapped in a <div> tag that was not in the original document. 
527   
528      If cleanup is true, make sure there's no <head> or <body>, and get 
529      rid of any <ins> and <del> tags. 
530      """ 
531      if cleanup: 
532           
533          html = cleanup_html(html) 
534      return fragment_fromstring(html, create_parent=True) 
 535   
# Patterns used to strip page structure from an HTML string.  All three are
# case-insensitive and let '.' match newlines, so a tag split across lines
# still matches.
# Opening <body ...> tag (with any attributes).
_body_re = re.compile(r'<body.*?>', re.I|re.S)
# Closing </body> tag.
_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
# Opening or closing tags whose name starts with "ins" or "del".
# NOTE(review): there is no word boundary after the name, so a tag such as
# <delta> would also match -- confirm whether that is intended.
_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)
539   
541      """ This 'cleans' the HTML, meaning that any page structure is removed 
542      (only the contents of <body> are used, if there is any <body). 
543      Also <ins> and <del> tags are removed.  """ 
544      match = _body_re.search(html) 
545      if match: 
546          html = html[match.end():] 
547      match = _end_body_re.search(html) 
548      if match: 
549          html = html[:match.start()] 
550      html = _ins_del_re.sub('', html) 
551      return html 
 552       
553   
# Matches a single space, tab, newline or carriage return at the end of a
# string; used to decide whether the last word of a text run keeps its
# trailing space.
end_whitespace_re = re.compile(r'[ \t\n\r]$')
555   
557      """ 
558      This function takes a list of chunks and produces a list of tokens. 
559      """ 
560      tag_accum = [] 
561      cur_word = None 
562      result = [] 
563      for chunk in chunks: 
564          if isinstance(chunk, tuple): 
565              if chunk[0] == 'img': 
566                  src = chunk[1] 
567                  tag = chunk[2] 
568                  if tag.endswith(' '): 
569                      tag = tag[:-1] 
570                      trailing_whitespace = True 
571                  else: 
572                      trailing_whitespace = False 
573                  cur_word = tag_token('img', src, html_repr=tag, 
574                                       pre_tags=tag_accum, 
575                                       trailing_whitespace=trailing_whitespace) 
576                  tag_accum = [] 
577                  result.append(cur_word) 
578              elif chunk[0] == 'href': 
579                  href = chunk[1] 
580                  cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=True) 
581                  tag_accum = [] 
582                  result.append(cur_word) 
583              continue 
584          if is_word(chunk): 
585              if chunk.endswith(' '): 
586                  chunk = chunk[:-1] 
587                  trailing_whitespace = True 
588              else: 
589                  trailing_whitespace = False 
590              cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace) 
591              tag_accum = [] 
592              result.append(cur_word) 
593          elif is_start_tag(chunk): 
594              tag_accum.append(chunk) 
595          elif is_end_tag(chunk): 
596              if tag_accum: 
597                  tag_accum.append(chunk) 
598              else: 
599                  assert cur_word, ( 
600                      "Weird state, cur_word=%r, result=%r, chunks=%r of %r" 
601                      % (cur_word, result, chunk, chunks)) 
602                  cur_word.post_tags.append(chunk) 
603          else: 
604              assert(0) 
605   
606      if not result: 
607          return [token('', pre_tags=tag_accum)] 
608      else: 
609          result[-1].post_tags.extend(tag_accum) 
610   
611      return result 
 612   
613   
614   
# Tag names handled as "empty" elements: when balancing tags they are passed
# straight through without being tracked on the open-tag stack, and when
# flattening an element of this kind that has no text, children or tail,
# only its start tag is emitted.
empty_tags = (
    'param', 'img', 'area', 'br', 'basefont', 'input',
    'base', 'meta', 'link', 'col')

# Tag names that count as block-level when checking whether an element is,
# or contains, a block-level element.
block_level_tags = (
    'address',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
    )

# Additional tag names that the same block-level check treats as block-level
# (list items, definition terms, table sections/rows/cells, frameset).
block_level_container_tags = (
    'dd',
    'dt',
    'frameset',
    'li',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
    )
658   
659   
def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag.  Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.

    Chunks are yielded in document order:

    * the start tag (for ``<img>`` a tuple ``('img', src, start_tag)``
      instead of a plain string; ``src`` is None when the attribute is
      absent),
    * the escaped words of ``el.text``,
    * the chunks of every child, produced recursively,
    * for an ``<a>`` with a non-empty href, and only when include_hrefs
      is true, a tuple ``('href', href)``,
    * the end tag followed by the escaped words of ``el.tail``.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents); its tail is not returned either."""
    if not skip_tag:
        if el.tag == 'img':
            # Use .get() so an <img> without a src attribute produces a
            # chunk with src=None instead of raising KeyError.
            yield ('img', el.attrib.get('src'), start_tag(el))
        else:
            yield start_tag(el)
    # An empty element with nothing attached has no content and no end tag.
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        return
    for word in split_words(el.text):
        yield cgi.escape(word)
    for child in el:
        for item in flatten_el(child, include_hrefs=include_hrefs):
            yield item
    if el.tag == 'a' and el.attrib.get('href') and include_hrefs:
        yield ('href', el.attrib['href'])
    if not skip_tag:
        yield end_tag(el)
        for word in split_words(el.tail):
            yield cgi.escape(word)
 687   
689      """ Splits some text into words. Includes trailing whitespace (one 
690      space) on each word when appropriate.  """ 
691      if not text or not text.strip(): 
692          return [] 
693      words = [w + ' ' for w in text.strip().split()] 
694      if not end_whitespace_re.search(text): 
695          words[-1] = words[-1][:-1] 
696      return words 
 697   
# Matches a single space, tab, newline or carriage return at the start of a
# string; used on an element's tail to decide whether its end tag is
# followed by a space.
start_whitespace_re = re.compile(r'^[ \t\n\r]')
699   
701      """ 
702      The text representation of the start tag for a tag. 
703      """ 
704      return '<%s%s>' % ( 
705          el.tag, ''.join([' %s="%s"' % (name, cgi.escape(value, True)) 
706                           for name, value in el.attrib.items()])) 
 707   
709      """ The text representation of an end tag for a tag.  Includes 
710      trailing whitespace when appropriate.  """ 
711      if el.tail and start_whitespace_re.search(el.tail): 
712          extra = ' ' 
713      else: 
714          extra = '' 
715      return '</%s>%s' % (el.tag, extra) 
 716   
718      return not tok.startswith('<') 
 719   
721      return tok.startswith('</') 
 722   
724      return tok.startswith('<') and not tok.startswith('</') 
 725   
734   
736      """ Serialize a single lxml element as HTML.  The serialized form 
737      includes the elements tail.   
738   
739      If skip_outer is true, then don't serialize the outermost tag 
740      """ 
741      assert not isinstance(el, basestring), ( 
742          "You should pass in an element, not a string like %r" % el) 
743      html = etree.tostring(el, method="html", encoding="UTF-8") 
744      if skip_outer: 
745           
746          html = html[html.find('>')+1:] 
747           
748          html = html[:html.rfind('<')] 
749          return html.strip() 
750      else: 
751          return html 
 752   
762               
763   
765      """True if the element contains any block-level elements, like <p>, <td>, etc. 
766      """ 
767      if el.tag in block_level_tags or el.tag in block_level_container_tags: 
768          return True 
769      for child in el: 
770          if _contains_block_level_tag(child): 
771              return True 
772      return False 
 773   
775      """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags 
776      and moves them inside any block-level tags.  """ 
777      for child in el: 
778          if _contains_block_level_tag(child): 
779              break 
780      else: 
781          import sys 
782           
783          children_tag = etree.Element(tag) 
784          children_tag.text = el.text 
785          el.text = None 
786          children_tag.extend(list(el)) 
787          el[:] = [children_tag] 
788          return 
789      for child in list(el): 
790          if _contains_block_level_tag(child): 
791              _move_el_inside_block(child, tag) 
792              if child.tail: 
793                  tail_tag = etree.Element(tag) 
794                  tail_tag.text = child.tail 
795                  child.tail = None 
796                  el.insert(el.index(child)+1, tail_tag) 
797          else: 
798              child_tag = etree.Element(tag) 
799              el.replace(child, child_tag) 
800              child_tag.append(child) 
801      if el.text: 
802          text_tag = etree.Element(tag) 
803          text_tag.text = el.text 
804          el.text = None 
805          el.insert(0, text_tag) 
 806               
808      """ 
809      Removes an element, but merges its contents into its place, e.g., 
810      given <p>Hi <i>there!</i></p>, if you remove the <i> element you get 
811      <p>Hi there!</p> 
812      """ 
813      parent = el.getparent() 
814      text = el.text or '' 
815      if el.tail: 
816          if not len(el): 
817              text += el.tail 
818          else: 
819              if el[-1].tail: 
820                  el[-1].tail += el.tail 
821              else: 
822                  el[-1].tail = el.tail 
823      index = parent.index(el) 
824      if text: 
825          if index == 0: 
826              previous = None 
827          else: 
828              previous = parent[index-1] 
829          if previous is None: 
830              if parent.text: 
831                  parent.text += text 
832              else: 
833                  parent.text = text 
834          else: 
835              if previous.tail: 
836                  previous.tail += text 
837              else: 
838                  previous.tail = text 
839      parent[index:index+1] = el.getchildren() 
 840   
class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes
    """

    # Upper bound on the size of matching blocks that get discarded; the
    # effective value is lowered for short sequences (see below).
    threshold = 2

    def get_matching_blocks(self):
        """Return the parent's matching blocks with the small ones removed.

        A block is kept when its length is greater than the effective
        threshold, or when its length is zero (the terminating sentinel
        block that SequenceMatcher always appends).  The effective
        threshold is ``self.threshold`` capped at a quarter of the shorter
        sequence's length, so short inputs can still match.
        """
        # Use the shorter of the *two* sequences; the previous code took
        # len(self.b) twice and never looked at self.a.
        size = min(len(self.a), len(self.b))
        # Floor division keeps the integer semantics regardless of whether
        # '/' performs true division.
        threshold = min(self.threshold, size // 4)
        actual = difflib.SequenceMatcher.get_matching_blocks(self)
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]
  856   
# When executed as a script, hand control to the main() function of
# lxml.html._diffcommand.  The import is done here so it only happens in
# script mode.
if __name__ == '__main__':
    from lxml.html import _diffcommand
    _diffcommand.main()
860